__version__ = "0.1.0"
import ast
from pathlib import Path
import pandas as pd
from pyhiv.align import align_with_references
from pyhiv.config import get_reference_paths, validate_reference_paths
from pyhiv.loading import read_input_fastas
from pyhiv.split import get_gene_region, get_present_gene_regions, map_ref_coords_to_alignment
from pyhiv.report import PyHIVReporter
import logging
FINAL_TABLE_COLUMNS = ['Sequence', 'Reference', 'Subtype', 'Most Matching Gene Region', 'Present Gene Regions']
[docs]
def PyHIV(fastas_dir: str, subtyping: bool = True, splitting: bool = True,
output_dir: str = None, n_jobs: int = None, reporting: bool = True):
"""
Main function to run the PyHIV pipeline.
It aligns the user sequences with the reference sequences and saves the
best alignment in a fasta file. If subtyping is True, it aligns the user
sequences with the reference sequences from the HIV-1 subtyping tool.
If splitting is True, it splits the user sequences into gene regions
and saves them in specific folders. It also saves a final table with the results.
If reporting is True, it generates a PDF report with visualizations.
"""
paths = get_reference_paths()
validate_reference_paths(paths)
fastas_dir = Path(fastas_dir)
output_dir = Path(output_dir) if output_dir else Path('PyHIV_results')
output_dir.mkdir(parents=True, exist_ok=True)
user_fastas = read_input_fastas(fastas_dir)
reference_sequences = pd.read_csv(paths["SEQUENCES_WITH_LOCATION"], sep='\t')
final_table = pd.DataFrame(columns=FINAL_TABLE_COLUMNS)
for fasta in user_fastas:
reference_dir = (
paths["REFERENCE_GENOMES_FASTAS_DIR"]
if subtyping else
paths["HXB2_GENOME_FASTA_DIR"]
)
best_alignment = align_with_references(fasta, references_dir=reference_dir, n_jobs=n_jobs)
if best_alignment is None:
continue
sequence_name = fasta.id
test_aligned, ref_aligned, ref_file = best_alignment
# Extract reference information
ref_file_parts = Path(ref_file).stem.split('-')
accession = ref_file_parts[0]
subtype = ref_file_parts[1] if len(ref_file_parts) > 1 else "Unknown"
# Retrieve gene ranges
gene_ranges = ast.literal_eval(
reference_sequences.loc[
reference_sequences['accession'] == accession, 'features'
].values[0]
)
# save a fasta file with the best alignment
final_alignment_file = output_dir / f"best_alignment_{sequence_name}.fasta"
with open(final_alignment_file, 'w') as output_file:
output_file.write(
f">Reference {Path(ref_file).stem}\n{ref_aligned}\n>{sequence_name}\n{test_aligned}\n"
)
if splitting:
mapping = map_ref_coords_to_alignment(ref_aligned)
aligned_gene_ranges = {
gene: (mapping[start], mapping[end])
for gene, (start, end) in gene_ranges.items()
if start in mapping and end in mapping
}
# get gene region with most matches
region = get_gene_region(test_aligned, ref_aligned, aligned_gene_ranges)
# get gene regions with base pair letters
present_regions = get_present_gene_regions(test_aligned, aligned_gene_ranges)
# Save gene regions fasta in each region-specific folder
for gene in present_regions:
gene_path = output_dir / gene
gene_path.mkdir(parents=True, exist_ok=True)
gene_file = gene_path / f"{sequence_name}_{gene}.fasta"
with open(gene_file, 'w') as output_file:
aln_start, aln_end = aligned_gene_ranges[gene]
seq_fragment = test_aligned[aln_start:aln_end+1]
output_file.write(f'>{sequence_name}\n{seq_fragment}\n')
# Save the results in the final global table
row_data = [sequence_name, accession, subtype, str(region).strip("[]"), str(present_regions).strip("[]")]
else:
row_data = [sequence_name, accession, subtype, "-", "-"]
final_table = pd.concat(
[final_table, pd.DataFrame([row_data], columns=final_table.columns)],
ignore_index=True
)
if not splitting:
final_table.drop(columns=['Most Matching Gene Region', 'Present Gene Regions'], inplace=True)
final_table.to_csv(output_dir / 'final_table.tsv', sep='\t', index=False)
# Generate PDF report if requested
if reporting:
try:
reporter = PyHIVReporter(output_dir, subtyping=subtyping, splitting=splitting)
sequences_with_locations_path = paths["SEQUENCES_WITH_LOCATION"]
pdf_path = reporter.generate_report(
final_table_path=output_dir / 'final_table.tsv',
sequences_with_locations_path=sequences_with_locations_path,
output_pdf_name="PyHIV_report_all_sequences.pdf"
)
logging.info(f"PDF report generated: {pdf_path}")
except Exception as e: # pragma: no cover
logging.exception("Error generating PDF report — continuing without it.")