Source code for pyhiv.loading.read_fastas
import logging
from pathlib import Path
from typing import List
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
[docs]
def read_input_fastas(input_folder: Path) -> List[SeqRecord]:
"""
Reads nucleotide FASTA files (.fasta, .fa, .fna, .ffn) from a specified input folder.
Parameters
----------
input_folder : Path
Path to the folder containing the FASTA files.
Returns
-------
List[SeqRecord]
A list of BioPython SeqRecord objects containing sequence IDs and sequences.
Raises
------
NotADirectoryError
If the input folder does not exist or is not a directory.
"""
if not input_folder.is_dir():
raise NotADirectoryError(f"Input folder {input_folder} is not a directory.")
supported_extensions = (".fasta", ".fa", ".fna", ".ffn")
sequences = []
fasta_files = [f for f in input_folder.iterdir() if f.suffix.lower() in supported_extensions]
if not fasta_files:
logging.warning(f"No FASTA files with supported extensions found in {input_folder}.")
for fasta_file in fasta_files:
try:
with open(fasta_file, "r") as handle:
records = list(SeqIO.parse(handle, "fasta"))
if not records:
logging.warning(f"File {fasta_file} contains no valid sequences.")
else:
sequences.extend(records)
logging.info(f"Successfully read {len(records)} sequences from {fasta_file}")
except Exception as e:
logging.error(f"Error reading {fasta_file}: {e}")
return sequences