Source code for GenomeUtils.downloaders.genome_downloader

#!/usr/bin/env python
"""
Filename: GenomeUtils/downloaders/genome_downloader.py
Author: Arash Ayat
Copyright: 2025, Alexander Schliep
Version: 0.1.1
Description: This file defines the Ensembl genome downloader, a concrete Downloader that fetches genome files via gget.
License: LGPL-3.0-or-later
"""

from __future__ import annotations

from pathlib import Path

import gget

from .downloader import Downloader


class EnsemblGenomeDownloader(Downloader):
    """
    Genome downloader backed by Ensembl.

    The download URLs are looked up with `gget`; each file is then fetched
    through the base-class `download_file` helper. The base `Downloader` is
    initialised with the directory
    `genomes_root_dir/ensembl/{assembly_id}/{ensembl_release}`.
    """

    def __init__(self,
                 assembly_id: str,
                 ensembl_release: int,
                 species: str,
                 genomes_root_dir: Path | str = Path('./data/genomes')
                 ):
        """
        Set up a downloader for one Ensembl assembly/release pair.

        Args:
            assembly_id: Identifier of the genome assembly (e.g., 'GRCh38').
            ensembl_release: Release number of the Ensembl database.
            species: Scientific name of the species (e.g., 'homo_sapiens').
            genomes_root_dir: Parent directory under which all downloaded
                genomes are kept. Defaults to './data/genomes'.
        """
        self.assembly_id = assembly_id
        self.ensembl_release = ensembl_release
        self.species = species
        self.genomes_root_dir = Path(genomes_root_dir)

        # Per-genome directory: <root>/ensembl/<assembly>/<release>.
        target_dir = self.genomes_root_dir.joinpath(
            'ensembl', assembly_id, str(ensembl_release)
        )
        super().__init__(target_dir)

    def __repr__(self) -> str:
        """Return a constructor-like summary of this downloader's settings."""
        fields = (
            f"assembly_id={self.assembly_id}",
            f"ensembl_release={self.ensembl_release}",
            f"species={self.species}",
            f"genomes_root_dir={self.genomes_root_dir}",
        )
        joined = ", ".join(fields)
        return f"{self.__class__.__name__}({joined})"

    def download(self) -> dict[str, Path]:
        """
        Fetch the genome files whose URLs are reported by `gget.ref`.

        Returns:
            A dictionary mapping a file type to the value returned by
            `download_file` for it. Keys are `dna`, `cdna`, and `annotation`.
        """
        # The three results are unpacked in the same order as the `which` list.
        annotation_url, transcript_url, genome_url = gget.ref(
            self.species,
            which=["gtf", "cdna", "dna"],
            release=self.ensembl_release,
            ftp=True,
            verbose=False,
        )

        # Files are fetched in this order: dna, cdna, annotation.
        sources = (
            ('dna', genome_url),
            ('cdna', transcript_url),
            ('annotation', annotation_url),
        )
        # Each file keeps the final path component of its URL as its name.
        return {
            kind: self.download_file(url, Path(url).name)
            for kind, url in sources
        }