Source code for pyhiv

__version__ = "0.1.0"

import ast
from pathlib import Path
import pandas as pd

from pyhiv.align import align_with_references
from pyhiv.config import get_reference_paths, validate_reference_paths
from pyhiv.loading import read_input_fastas
from pyhiv.split import get_gene_region, get_present_gene_regions, map_ref_coords_to_alignment
from pyhiv.report import PyHIVReporter
import logging

FINAL_TABLE_COLUMNS = ['Sequence', 'Reference', 'Subtype', 'Most Matching Gene Region', 'Present Gene Regions']


[docs]
def PyHIV(fastas_dir: str, subtyping: bool = True, splitting: bool = True,
          output_dir: str = None, n_jobs: int = None, reporting: bool = True):
    """
    Main function to run the PyHIV pipeline.
    It aligns the user sequences with the reference sequences and saves the
    best alignment in a fasta file. If subtyping is True, it aligns the user
    sequences with the reference sequences from the HIV-1 subtyping tool.
    If splitting is True, it splits the user sequences into gene regions
    and saves them in specific folders. It also saves a final table with the results.
    If reporting is True, it generates a PDF report with visualizations.
    """
    paths = get_reference_paths()
    validate_reference_paths(paths)

    fastas_dir = Path(fastas_dir)
    output_dir = Path(output_dir) if output_dir else Path('PyHIV_results')
    output_dir.mkdir(parents=True, exist_ok=True)

    user_fastas = read_input_fastas(fastas_dir)
    reference_sequences = pd.read_csv(paths["SEQUENCES_WITH_LOCATION"], sep='\t')

    final_table = pd.DataFrame(columns=FINAL_TABLE_COLUMNS)

    for fasta in user_fastas:
        reference_dir = (
            paths["REFERENCE_GENOMES_FASTAS_DIR"]
            if subtyping else
            paths["HXB2_GENOME_FASTA_DIR"]
        )
        best_alignment = align_with_references(fasta, references_dir=reference_dir, n_jobs=n_jobs)

        if best_alignment is None:
            continue

        sequence_name = fasta.id
        test_aligned, ref_aligned, ref_file = best_alignment



        # Extract reference information
        ref_file_parts = Path(ref_file).stem.split('-')
        accession = ref_file_parts[0]
        subtype = ref_file_parts[1] if len(ref_file_parts) > 1 else "Unknown"

        # Retrieve gene ranges
        gene_ranges = ast.literal_eval(
            reference_sequences.loc[
                reference_sequences['accession'] == accession, 'features'
            ].values[0]
        )

        # save a fasta file with the best alignment
        final_alignment_file = output_dir / f"best_alignment_{sequence_name}.fasta"
        with open(final_alignment_file, 'w') as output_file:
            output_file.write(
                f">Reference {Path(ref_file).stem}\n{ref_aligned}\n>{sequence_name}\n{test_aligned}\n"
            )

        if splitting:

            mapping = map_ref_coords_to_alignment(ref_aligned)

            aligned_gene_ranges = {
                gene: (mapping[start], mapping[end])
                for gene, (start, end) in gene_ranges.items()
                if start in mapping and end in mapping
            }

            # get gene region with most matches
            region = get_gene_region(test_aligned, ref_aligned, aligned_gene_ranges)
            # get gene regions with base pair letters
            present_regions = get_present_gene_regions(test_aligned, aligned_gene_ranges)

            # Save gene regions fasta in each region-specific folder
            for gene in present_regions:
                gene_path = output_dir / gene
                gene_path.mkdir(parents=True, exist_ok=True)
                gene_file = gene_path / f"{sequence_name}_{gene}.fasta"
                with open(gene_file, 'w') as output_file:
                    aln_start, aln_end = aligned_gene_ranges[gene]
                    seq_fragment = test_aligned[aln_start:aln_end+1]
                    output_file.write(f'>{sequence_name}\n{seq_fragment}\n')

            # Save the results in the final global table
            row_data = [sequence_name, accession, subtype, str(region).strip("[]"), str(present_regions).strip("[]")]
        else:
            row_data = [sequence_name, accession, subtype, "-", "-"]

        final_table = pd.concat(
            [final_table, pd.DataFrame([row_data], columns=final_table.columns)],
            ignore_index=True
        )
    if not splitting:
        final_table.drop(columns=['Most Matching Gene Region', 'Present Gene Regions'], inplace=True)

    final_table.to_csv(output_dir / 'final_table.tsv', sep='\t', index=False)
    
    # Generate PDF report if requested
    if reporting:
        try:
            reporter = PyHIVReporter(output_dir, subtyping=subtyping, splitting=splitting)
            sequences_with_locations_path = paths["SEQUENCES_WITH_LOCATION"]
            pdf_path = reporter.generate_report(
                final_table_path=output_dir / 'final_table.tsv',
                sequences_with_locations_path=sequences_with_locations_path,
                output_pdf_name="PyHIV_report_all_sequences.pdf"
            )
            logging.info(f"PDF report generated: {pdf_path}")
        except Exception as e: # pragma: no cover
            logging.exception("Error generating PDF report — continuing without it.")