Source code for python_bioinformagicks.tools._call_scSNP

import pysam
import pandas as pd

[docs] def call_scSNP( cell_barcodes: list[str], bam_file: pysam.AlignmentFile | str, contig: str, position: int, alt: str, threshold: int = 9, cb_tag: str = "CB", return_stats: bool = False, ): ''' With a list of valid cell barcodes as reference, scan a BAM file for reads aligning to a given genomic position and perform variant calling at that position for each cell. This function is crude and useful only for single point mutation calling, i.e. "I know there is a mutation this position; tell me which cells have it." Consider using more comprehensive tools such as `cellsnp-lite`. Parameters ---------- cell_barcodes: list of str A list of valid cell barcodes bam_file: pysam.AlignmentFile or str An instantiated pysam.AlignmentFile object representing an open, indexed BAM file. If str, function will open using: `f = pysam.AlignmentFile(bam_file, "rb")` contig: str The contig containing the genomic position of interest. position: int > 0 The 1-indexed position on the contig of the genomic position of interest. alt: str The alternate nucleotide. Number of reads with this alt nucleotide will be compared to total number of reads to perform variant calling. One of `["A", "G", "T", "C"]`. threshold: int (default: 9) The minimum number of reads covering the region of needed to perform variant calling. If `#reads < threshold`, call NA. If `#reads(ALT) < (threshold/3)`, call WT. Else, call alt allele. cb_tag: str (default: "CB") The tag in the BAM file that stores the (corrected) cell barcodes. Usually one of ["CB", "CR"]. return_stats: bool (default: False) If True, return dictionary of variant calling statistics, including per-cell allele frequencies. Returns ------- variant_calls: list of str List corresponding 1-to-1 with list of cell barcodes. stats: dict (when `return_stats is True`) Dictionary of variant calling statistics References ---------- * https://doi.org/10.1093/bioinformatics/btab358 * https://github.com/single-cell-genetics/cellsnp-lite ''' # Open the BAM file if necessary if (type(bam_file) is str): try: bam_file = pysam.AlignmentFile(bam_file, "rb") except Exception as e: print("Unable to open bam file:") print(e) return None # validate alt if (alt not in ["A", "G", "T", "C"]): print("alt must be one of [A,G,T,C]") return None # Get reads in region position = position - 1 # Pysam 0-indexes, correct it here start_pos = position - 1 end_pos = position + 1 reads_in_region = bam_file.fetch( contig, start=start_pos, end=end_pos ) # Generate dictionary of reads read_dict = {} for barcode in cell_barcodes: read_dict[barcode] = [] for read in reads_in_region: barcode = read.get_tag(cb_tag) if (barcode in read_dict.keys()): seq = read.get_forward_sequence() # only insert the single nucleotide of interest # into the dictionary, instead of the whole read reference_positions = read.get_reference_positions() for j, pos in enumerate(reference_positions): if (pos == (position)): read_dict[barcode].append(seq[j]) break # Pileup the reads pileup_dict = {} for k,v in read_dict.items(): pileup_dict[k] = { "C": 0, "T": 0, "G": 0, "A": 0, "N": 0, } if (v): for base in v: if (base in pileup_dict[k].keys()): pileup_dict[k][base] += 1 df = pd.DataFrame.from_dict(pileup_dict).T variant_calls = df.apply( lambda x: _do_call_thresholding(x, alt, threshold), axis=1 ).tolist() if (return_stats): return variant_calls, pileup_dict else: return variant_calls
def _do_call_thresholding(x, alt, threshold): if (x.sum() < threshold): return "NA" elif (x[alt] < (threshold/3.0)): return "WT" else: return alt