From 64425100ef385ea526a8aeb5af4b6a3b944fa738 Mon Sep 17 00:00:00 2001 From: Keshav Date: Sun, 23 Jun 2024 17:10:15 -0500 Subject: [PATCH] Deleting old quality score simulation model. --- neat/models/markov.py | 142 ------------------------------------------ 1 file changed, 142 deletions(-) delete mode 100644 neat/models/markov.py diff --git a/neat/models/markov.py b/neat/models/markov.py deleted file mode 100644 index 6e9dcae..0000000 --- a/neat/models/markov.py +++ /dev/null @@ -1,142 +0,0 @@ -import pysam -import numpy as np -import pandas as pd -import pathlib -import matplotlib.pyplot as plt -import seaborn as sns -from scipy.stats import norm - -def make_qual_score_list(bam_file): - '''Takes an input BAM file and creates lists of quality scores. This becomes a data frame, which will be - pre-processed for Markov chain analysis.''' - - index = f'{bam_file}.bai' - - if not pathlib.Path(index).exists(): - print('No index found, creating one.') - - pysam.index(bam_file) - - file_to_parse = pysam.AlignmentFile(bam_file, 'rb', check_sq=False) - num_recs = file_to_parse.count() - print(f'{num_recs} records to parse') - - modulo = round(num_recs / 9) - - qual_list = [] - i = 0 - j = 0 - - def print_update(number, factor, percent): - if number % factor == 0: - percent += 10 - print(f'{percent}% complete', end='\r') - return percent - - print('Parsing file') - - for item in file_to_parse.fetch(): - if item.is_unmapped or len(item.seq) != 249 or 'S' in item.cigarstring: - i += 1 - j = print_update(i, modulo, j) - continue - - # Mapping quality scores - - align_qual = item.query_alignment_qualities - - # Append to master lists - - qual_list.append(align_qual) - i += 1 - j = print_update(i, modulo, j) - - print(f'100% complete') - file_to_parse.close() - - # Turn list of lists into a dataframe - - quality_df = pd.DataFrame(qual_list) - - # Pre-processing - fill in missing data and sample 1% of reads - - quality_df = quality_df.fillna(0) - quality_df = quality_df.sample(frac=0.01, axis=0, random_state=42) - - return quality_df - -def estimate_transition_probabilities(): - - # Define the probabilities for transition states based on a normal distribution - - std_dev = 1 - transition_probs = { - -3: norm.pdf(-3, 0, std_dev), - -2: norm.pdf(-2, 0, std_dev), - -1: norm.pdf(-1, 0, std_dev), - 0: norm.pdf(0, 0, std_dev), - 1: norm.pdf(1, 0, std_dev), - 2: norm.pdf(2, 0, std_dev), - 3: norm.pdf(3, 0, std_dev) - } - - # Normalize the probabilities to sum to 1 - - total_prob = sum(transition_probs.values()) - for k in transition_probs: - transition_probs[k] /= total_prob - - return transition_probs - -def apply_markov_chain(quality_df, L=249): - - transition_probs = estimate_transition_probabilities() - num_rows, num_cols = quality_df.shape - - markov_preds = [] - - for row in quality_df.iterrows(): - qualities = row[1].values - pred_qualities = np.zeros_like(qualities) - pred_qualities[0] = qualities[0] # initial state - - for i in range(1, L): - prev_quality = pred_qualities[i - 1] - transitions = list(transition_probs.keys()) - probabilities = list(transition_probs.values()) - next_quality = np.random.choice(transitions, p=probabilities) - pred_qualities[i] = max(0, prev_quality + next_quality) # ensuring no negative qualities - - markov_preds.append(pred_qualities) - - markov_preds_df = pd.DataFrame(markov_preds) - - # Apply final transformations - - edge_len = int(L * 0.05) - mid_start = int(L * 0.40) - mid_end = int(L * 0.60) - - markov_preds_df.iloc[:, :edge_len] -= 5 - markov_preds_df.iloc[:, :edge_len] = markov_preds_df.iloc[:, :edge_len].clip(lower=0) - - markov_preds_df.iloc[:, -edge_len:] -= 5 - markov_preds_df.iloc[:, -edge_len:] = markov_preds_df.iloc[:, -edge_len:].clip(lower=0) - - markov_preds_df.iloc[:, mid_start:mid_end] += 1 - - return markov_preds_df - -def plot_heatmap(y_preds_df, file_path): - '''Takes a dataframe of predicted quality scores and plots a seaborn heatmap to visualize them.''' - - sns.heatmap(y_preds_df, vmin=0, vmax=40, cmap='viridis') - plt.savefig(file_path) - print('Heatmap plotted') - -# Example usage - -bam_file = '/projects/bclt/neat_data/H1N1_new.bam' -test_df = make_qual_score_list(bam_file) -markov_preds_df = apply_markov_chain(test_df) -plot_heatmap(markov_preds_df, 'markov_chain_heatmap.svg')