Skip to content

Download utils

Utility functions aimed at downloading any data from external sources.

download_cloud_directory(cloud_directory, output_folder, cloud='gs') #

Download a given cloud directory.

Parameters:

Name Type Description Default
cloud_directory str

for example gs://bucket-name/path/to/directory

required
output_folder str

where the downloaded data will be stored (ideally the data/ folder)

required
cloud str

the cloud provider, currently only "gs" is supported

'gs'
Source code in src/utils/download_utils.py
27
28
29
30
31
32
33
34
35
def download_cloud_directory(cloud_directory: str, output_folder: str, cloud: str = "gs") -> None:
    """Download a given cloud directory.

    Args:
        cloud_directory: URI of the remote directory, for example gs://bucket-name/path/to/directory
        output_folder: where the downloaded data will be stored (ideally data/ folder)
        cloud: the cloud provider, currently only "gs" is supported

    Raises:
        ValueError: if ``cloud`` names a provider other than "gs".
    """
    # Fail fast rather than silently ignoring the argument: the documented
    # contract is that only Google Cloud Storage is supported.
    if cloud != "gs":
        raise ValueError(f'Unsupported cloud provider {cloud!r}: currently only "gs" is supported')

    # cloudpathlib's public entry point is CloudPath (the package has no `Path`);
    # it dispatches to the provider-specific class based on the URI scheme.
    cloudpathlib.CloudPath(cloud_directory).download_to(output_folder)

download_kaggle_dataset(dataset_name, output_folder) #

Download a given Kaggle dataset.

Parameters:

Name Type Description Default
dataset_name str

for example googleai/pfam-seed-random-split

required
output_folder str

where the downloaded data will be stored (ideally the data/ folder)

required
Source code in src/utils/download_utils.py
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
def download_kaggle_dataset(dataset_name: str, output_folder: str) -> None:
    """Fetch a Kaggle dataset and unpack it into a local folder.

    Authenticates against the Kaggle API, then downloads every file of the
    dataset into ``output_folder`` with ``unzip=True`` and ``quiet=False``.

    Args:
        dataset_name: Kaggle dataset identifier, for example googleai/pfam-seed-random-split
        output_folder: where the downloaded data will be stored (ideally data/ folder)
    """
    # NOTE(review): imported at call time, presumably so that importing this
    # module does not require the kaggle package / credentials — confirm.
    from kaggle.api.kaggle_api_extended import KaggleApi

    kaggle_client = KaggleApi()

    log.info("Authenticating to Kaggle API")
    kaggle_client.authenticate()

    log.info("Downloading dataset")
    kaggle_client.dataset_download_files(
        dataset_name,
        path=output_folder,
        unzip=True,
        quiet=False,
    )
    log.info("Download successful")