Source code for pyhiv.report.reporter

"""
Main reporting class for PyHIV results.
"""

from pathlib import Path
import logging

import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages

from .pdf_generator import render_sequence_page
from pyhiv.report.constants import K03455Config
from pyhiv.report.utils import (
    read_alignment_fasta, parse_present_regions, parse_features,
    is_special_reference, normalize_features, normalize_present_regions,
    build_ref_to_alignment_map, project_features_to_alignment,
    build_alignment_path
)


class PyHIVReporter:
    """Main class for generating PyHIV PDF reports."""

    # Columns that final_table.tsv must provide for a report to be built.
    _REQUIRED_FINAL_TABLE_COLUMNS = (
        "Sequence",
        "Reference",
        "Subtype",
        "Most Matching Gene Region",
        "Present Gene Regions",
    )

    def __init__(self, output_dir: Path, log_level: int = logging.INFO) -> None:
        """
        Initialize the reporter with output directory and logger.

        Parameters
        ----------
        output_dir : Path
            Directory to save the generated PDF report. It is created
            (including parents) when it does not exist yet.
        log_level : int, optional
            Logging level, by default logging.INFO
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Configure logger (named after the class, shared by all instances)
        self.logger = logging.getLogger(self.__class__.__name__)
        self.logger.setLevel(log_level)

        # Avoid adding duplicate handlers if multiple instances are created
        if not self.logger.handlers:
            handler = logging.StreamHandler()
            formatter = logging.Formatter(
                fmt="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
                datefmt="%Y-%m-%d %H:%M:%S",
            )
            handler.setFormatter(formatter)
            self.logger.addHandler(handler)

    def generate_report(
        self,
        final_table_path: Path,
        sequences_with_locations_path: Path,
        output_pdf_name: str = "PyHIV_report_all_sequences.pdf",
    ) -> Path:
        """
        Generate PDF report from PyHIV results.

        One page is rendered per row of the final table. Rows whose
        alignment FASTA is missing or unreadable are logged and skipped.

        Parameters
        ----------
        final_table_path : Path
            Path to final_table.tsv file.
        sequences_with_locations_path : Path
            Path to sequences_with_locations.tsv file.
        output_pdf_name : str, optional
            Name of the output PDF file, by default
            "PyHIV_report_all_sequences.pdf"

        Returns
        -------
        Path
            Path to the generated PDF report.

        Raises
        ------
        ValueError
            If either input table lacks one of its required columns.
        """
        # Read input data
        self.logger.info(
            "Reading %s and %s...", final_table_path, sequences_with_locations_path
        )
        final_table = self._load_final_table(final_table_path)
        features_by_acc = self._load_features(sequences_with_locations_path)

        # Generate PDF
        output_pdf_path = self.output_dir / output_pdf_name
        self.logger.info("Generating PDF report: %s", output_pdf_path)

        pages_made = 0
        with PdfPages(output_pdf_path) as pdf:
            # iterrows keeps the original column labels, which contain spaces.
            for _, row in final_table.iterrows():
                if self._render_row(pdf, row, features_by_acc):
                    pages_made += 1

        if pages_made == 0:
            self.logger.warning("No pages created. Check your paths and file formats.")
        else:
            self.logger.info("PDF created: %s (pages: %s)", output_pdf_path, pages_made)

        return output_pdf_path

    def _load_final_table(self, final_table_path: Path) -> pd.DataFrame:
        """Read final_table.tsv and verify that all required columns exist."""
        final_table = pd.read_csv(final_table_path, sep="\t")
        missing = [
            column
            for column in self._REQUIRED_FINAL_TABLE_COLUMNS
            if column not in final_table.columns
        ]
        if missing:
            raise ValueError(f"Missing columns in final_table: {missing}")
        return final_table

    def _load_features(self, sequences_with_locations_path: Path) -> dict:
        """Read sequences_with_locations.tsv and parse features per accession."""
        swl = pd.read_csv(sequences_with_locations_path, sep="\t")
        if "accession" not in swl.columns or "features" not in swl.columns:
            raise ValueError(
                "sequences_with_locations.tsv must include 'accession' and 'features' columns"
            )

        features_by_acc = {}
        for raw_accession, raw_features in zip(swl["accession"], swl["features"]):
            acc = str(raw_accession)
            try:
                features_by_acc[acc] = parse_features(raw_features)
            except Exception as e:
                # Best effort: one malformed row must not abort the whole report.
                self.logger.warning(
                    "Failed to parse features for accession %s: %s", acc, e
                )
                features_by_acc[acc] = {}
        return features_by_acc

    def _render_row(self, pdf, row, features_by_acc: dict) -> bool:
        """Render one final-table row as a PDF page; return True if a page was added."""
        sequence = str(row["Sequence"])
        accession = str(row["Reference"])
        subtype = str(row["Subtype"])
        # An empty cell is read by pandas as NaN; map it to the "-" placeholder
        # so it is blanked below instead of being printed as the text "nan".
        mm_region = self._cell_text(row["Most Matching Gene Region"], default="-")
        present_regions_raw = parse_present_regions(row["Present Gene Regions"])

        # Find alignment file
        fasta_path = build_alignment_path(sequence, self.output_dir)
        if not fasta_path.exists():
            self.logger.warning(
                "Alignment FASTA not found for %s: %s", sequence, fasta_path
            )
            return False

        try:
            ref_header, ref_seq_aln, user_header, user_seq_aln = read_alignment_fasta(
                fasta_path
            )
        except Exception as e:
            # Best effort: skip this sequence and keep building the report.
            self.logger.error("Error reading %s: %s", fasta_path, e)
            return False

        special = is_special_reference(accession, ref_header)
        ref_map, _ = build_ref_to_alignment_map(ref_seq_aln)

        raw_features = features_by_acc.get(accession, {})
        features_genomic = normalize_features(raw_features, special)
        present_regions = normalize_present_regions(present_regions_raw, special)
        features_aln = project_features_to_alignment(features_genomic, ref_map)

        if special:
            # Special reference: keep only the configured target regions and
            # use the fixed lane positions from K03455Config.
            features_aln = {
                name: span
                for name, span in features_aln.items()
                if name in K03455Config.TARGET_REGIONS
            }
            present_regions = [g for g in present_regions if g in features_aln]
            y_pos = K03455Config.Y_POSITIONS
        else:
            # non-K03455 → auto lanes + configurable numeric offsets + fixed x >= 10,000
            y_pos = None

        render_sequence_page(
            pdf=pdf,
            sequence=sequence,
            accession=accession,
            subtype=subtype,
            mm_region="" if mm_region == "-" else mm_region,
            present_regions=present_regions,
            features_aln=features_aln,
            ref_seq_aligned=ref_seq_aln,
            user_seq_aligned=user_seq_aln,
            y_positions=y_pos,
        )
        self.logger.info(
            "Added page for %s (special=%s, features=%s)",
            sequence,
            special,
            list(features_aln),
        )
        return True

    @staticmethod
    def _cell_text(value, default: str = "") -> str:
        """Return ``str(value)``, or *default* when the cell is missing (None/NaN/NA)."""
        if value is None:
            return default
        try:
            missing = bool(pd.isna(value))
        except (TypeError, ValueError):
            # Non-scalar cell: pd.isna returned an array; treat it as present.
            missing = False
        return default if missing else str(value)