From d389ada333b6411e101583f77fc8d6f60e2cbd90 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 13 May 2024 22:18:52 -0500 Subject: [PATCH 1/8] Making progress simplifying and strengthening the generate reads section --- neat/models/default_fraglen_model.py | 8 - neat/models/models.py | 42 +- neat/read_simulator/runner.py | 17 +- neat/read_simulator/utils/generate_reads.py | 380 +++++------------- neat/read_simulator/utils/options.py | 13 +- ...over_dataset.py => test_generate_reads.py} | 63 +-- 6 files changed, 151 insertions(+), 372 deletions(-) delete mode 100644 neat/models/default_fraglen_model.py rename tests/test_read_simulator/{test_cover_dataset.py => test_generate_reads.py} (73%) diff --git a/neat/models/default_fraglen_model.py b/neat/models/default_fraglen_model.py deleted file mode 100644 index 5b3ef168..00000000 --- a/neat/models/default_fraglen_model.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -The following model is the default for the fragment length distribution. -""" - -default_fragment_mean = 601.0 -default_fragment_std = 103.0 -default_fragment_max = 1626.0 -default_fragment_min = 3.0 \ No newline at end of file diff --git a/neat/models/models.py b/neat/models/models.py index f2a26b0b..fef6f270 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -21,7 +21,6 @@ from .default_mutation_model import * from .default_sequencing_error_model import * from .default_gc_bias_model import * -from .default_fraglen_model import * from .utils import bin_scores, take_closest __all__ = [ @@ -600,53 +599,28 @@ class FragmentLengthModel: :param fragment_mean: the mean of the collection of fragment lengths derived from data :param fragment_std: the standard deviation of the collection of fragment lengths derived from data - :param fragment_max: the largest fragment observed in the data - :param fragment_min: the smallest fragment observed in data :param rng: the random number generator for the run """ def __init__(self, - fragment_mean: float = None, - fragment_std: float = None, - fragment_max: int = None, - fragment_min: int = None, + fragment_mean: float, + fragment_std: float, rng: Generator = None): - self.fragment_mean = fragment_mean if fragment_mean else default_fragment_mean - self.fragment_st_dev = fragment_std if fragment_std else default_fragment_std - self.fragment_max = fragment_max if fragment_max else default_fragment_max - self.fragment_min = fragment_min if fragment_min else default_fragment_min + self.fragment_mean = fragment_mean + self.fragment_st_dev = fragment_std self.rng = rng def generate_fragments(self, total_length: int, - read_length: int, - coverage: int) -> list: + number_of_fragments: int) -> list: """ Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set :param total_length: Length of the reference segment we are covering. - :param read_length: average length of the reads - :param coverage: the target coverage number + :param number_of_fragments: The number of fragments needed. :return: A list of fragment random fragment lengths sampled from the model. """ - # Estimate the number of fragments needed (with a 2x padding) - number_of_fragments = int(round(total_length / read_length) * (coverage * 2)) - # Check that we don't have unusable values for fragment mean. Too many fragments under the read length means - # NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or - # it will only find a few and will run very slowly. 
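# Illustrative sketch only (values and the `rng` seed here are assumptions, not part of
# this patch): the simplified FragmentLengthModel that follows just samples a normal
# distribution, so guarding against fragments shorter than the read length is now the
# caller's job -- cover_dataset later drops reads drawn from fragments that are shorter
# than the read length instead of resetting the mean.
import numpy as np
rng = np.random.default_rng(0)
fragment_mean, fragment_st_dev, n_fragments = 250.0, 30.0, 5
fragments = np.round(rng.normal(fragment_mean, fragment_st_dev, size=n_fragments)).astype(int)
# fragments cluster around fragment_mean; anything under read_len is filtered downstream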
- if self.fragment_mean < read_length: - # Let's just reset the fragment mean to make up for this. - self.fragment_mean = read_length # generates a distribution, assuming normality, then rounds the result and converts to ints dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int) - # filter the list to throw out outliers and to set anything under the read length to the read length. - dist = [max(x, read_length) for x in dist if x <= self.fragment_max] - # Just a sanity check to make sure our data isn't too thin: - while number_of_fragments - len(dist) > 0: - additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev) - if additional_fragment < read_length: - continue - dist.append(round(additional_fragment)) - - # Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length - return dist + + return list(dist) diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 43d0836e..2e9f4965 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -98,15 +98,16 @@ def initialize_all_models(options: Options): _LOG.debug('GC Bias model loaded') - fraglen_model = None - if options.paired_ended: - if options.fragment_model: - fraglen_model = pickle.load(gzip.open(options.fragment_model)) - else: - fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev) - - # Set the rng for the fragment length model + if options.fragment_model: + fraglen_model = pickle.load(gzip.open(options.fragment_model)) fraglen_model.rng = options.rng + elif options.fragment_mean: + fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng) + else: + # For single ended, fragment length will be based on read length + fragment_mean = options.read_len * 1.5 + fragment_st_dev = fragment_mean * 0.2 + fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng) _LOG.debug("Fragment length model loaded") diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 96b3c8a4..7a4bc699 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -3,7 +3,7 @@ import pickle import numpy as np -from math import ceil, floor +from math import ceil from pathlib import Path from Bio import SeqRecord from Bio.Seq import Seq @@ -17,7 +17,8 @@ from .read import Read __all__ = [ - 'generate_reads' + 'generate_reads', + 'cover_dataset', ] _LOG = logging.getLogger(__name__) @@ -36,15 +37,18 @@ def is_too_many_n(segment): return n / len(segment) >= 0.2 +""" +Currently this code block is unused, but I think it might be useful for parallelization, so I wanted to leave it in. + def create_windows(sequence: SeqRecord, size: int, overlap: int): - """ - Create a list of windows. We might need this for parallelization, so I'm leaving the code in place + \""" + Create a list of windows. Currently Unused. 
We might need this for parallelization, so I'm leaving the code in place :param sequence: Sequence to split :param size: size of windows :param overlap: size of overlap between windows :return: list of windows - """ + \""" windows = [] for i in range(0, len(sequence), size): if i < overlap and i + size + overlap < len(sequence): @@ -55,13 +59,12 @@ def create_windows(sequence: SeqRecord, size: int, overlap: int): windows.append((i, len(sequence))) return windows +""" def cover_dataset( span_length: int, - target_vector: np.ndarray, options: Options, - gc_bias_model: GcModel, fragment_model: FragmentLengthModel | None, ) -> list: """ @@ -69,211 +72,79 @@ def cover_dataset( to the proper coverage depth. It uses an abstract representation of the reads, by end points. :param span_length: The total length the cover needs to span - :param target_vector: The vector of coverage targets by position :param options: The options for the run - :param gc_bias_model: The gc-bias model being used for this run. :param fragment_model: The fragment model used for to generate random fragment lengths """ - # Variables to track reads and current coverage - temp_reads = [] - final_coverage = [0] * len(target_vector) - final_reads = [] - - # Determine potential fragment lengths - if fragment_model: - # Paired ended case - fragment_pool = fragment_model.generate_fragments(span_length, options.read_len, options.coverage) - else: - # single ended case - fragment_pool = [options.read_len] - - # We need a window to cover. If window size of the gc model is greater than read length, then we'll call it 1, - # meaning 1 section of the target vector. - read_window_width = max(max(fragment_pool)//gc_bias_model.window_size, 1) - - # A quarter of a read length - quarter_read = options.read_len//4 - # for each position, the set of reads covering that position. - for i in range(len(target_vector)-read_window_width): - section_id = (i, i+read_window_width) - subsection_target_vector = target_vector[i: i+read_window_width] - subsection_target = ceil(np.mean(subsection_target_vector)) - - # Check if we even need to bother this section - section_max = max(final_coverage[i: i+read_window_width]) - if section_max >= subsection_target: - # we have enough reads here, move on. - i += options.rng.choice(list(range(quarter_read))) - continue - - # We need to know where we are along the reference contig. We are i * read_window_widths in, - # and each read_window is gc_bias_model.window_size long. - start_coordinate = i * read_window_width * gc_bias_model.window_size - # read_count counts the reads added to this section - read_count = 0 - # 1.5x as many as needed to give the final set a bit of randomness. - temp_pool_size = ceil(subsection_target * 1.5) - # debug variable - k = 0 - while read_count < temp_pool_size: - # The structure for these reads will be (left_start, left_end, right_start, right_end) - # where start and end are ints with end > start. Reads can overlap, so right_start < left_end - # is possible, but the reads cannot extend past each other, so right_start < left_start and - # left_end > right_end are not possible. - if options.paired_ended: - left_read_length = right_read_length = options.read_len - # Select a random fragment length from the pool. - fragment_length = options.rng.choice(fragment_pool) - else: - left_read_length = options.read_len - right_read_length = 0 - fragment_length = left_read_length - - # Booleans telling us how to deal with the various exceptions below. 
- skip_left = False - skip_right = False - - left_start = start_coordinate - # This trims the left read to the region of interest - left_end = min(span_length, start_coordinate + left_read_length) - - # Make right read, if paired ended, else set it equal to the default - if options.paired_ended: - # The candidate end points of the paired right read, uncorrected. It starts from the far end, so we - # know the end based on the fragment length and we count in from there. - uncorrected_right_end = start_coordinate + fragment_length - uncorrected_right_start = uncorrected_right_end - right_read_length - - # this trims the raw right read, so that right_start >= left_start and left_end <= right_end - # this keeps us from getting unrealistic paired ended reads - right_start = max(uncorrected_right_start, left_start) - right_end = min(max(uncorrected_right_end, left_end), span_length) - - # Both reads are off the map, so we throw this out: - if right_end <= 0: - # Move the start coordinate slightly to see if this keeps things moving. - start_coordinate = min( - max(start_coordinate + options.rng.integers(-10, 10), 0), - span_length - fragment_length - 1 - ) - k += 1 - if k >= 1000: - _LOG.debug(f"skipped {k} times") - continue - - else: - skip_right = True - right_start = 0 - right_end = 0 - - # Cases that might make us want to skip these reads: - - # Case 1: Truncated left read, unusable - if left_end - left_start < left_read_length: - # right read is usable if it's not off the map and is at least a read length long - if right_start < span_length and \ - (right_end - right_start >= right_read_length): - # skip the left read, keep the right read, indicating an unpaired right read - skip_left = True - # else, skip both - else: - skip_left = skip_right = True - - # Case 2: Truncated right read, unusable - if right_end - right_start < right_read_length: - # left read is usable if it's not off the map and has at least half a - # read length of usable bases - if left_start < span_length and \ - (left_end - left_start >= left_read_length): - # Set the right read to (0, 0), indicating an unpaired left read - skip_right = True - # else, skip both + final_reads = set() + # precompute how many reads we want + # The numerator is the total number of base pair calls needed. + # Divide that by read length gives the number of reads needed + number_reads = ceil((span_length * options.coverage) / options.read_len) + + # We use fragments to model the DNA + fragment_pool = fragment_model.generate_fragments(span_length, number_reads * 3) + + # step 1: Divide the span up into segments drawn froam the fragment pool. Assign reads based on that. 
+ # step 2: repeat above until number of reads exceeds number_reads * 1.5 + # step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads + read_count = 0 + while read_count <= number_reads: + start = 0 + temp_fragments = [] + # Breaking the gename into fragments + while start < span_length: + # We take the first element and put it back on the end to create an endless pool of fragments to draw from + fragment = fragment_pool.pop(0) + end = min(start + fragment, span_length) + # these are equivalent of reads we expect the machine to filter out, but we won't actually use it + if end - start < options.read_len: + # add some random flavor to try to keep it to falling into a loop + if options.rng.normal() < 0.5: + fragment_pool.insert(fragment, len(fragment_pool)//2) else: - skip_left = skip_right = True - - if skip_left and skip_right: - # If neither read is usable, we'll just move to the next - # Move the start coordinate slightly to see if this keeps things moving. - start_coordinate = min( - max(start_coordinate + options.rng.integers(-10, 10), 0), - span_length - fragment_length - 1 - ) - k += 1 - if k >= 1000: - _LOG.debug(f"skipped {k} times") - continue - - if not skip_left: - temp_left_read = (left_start, left_end) + fragment_pool.insert(fragment, len(fragment_pool) - 3) else: - temp_left_read = (0, 0) - - if not skip_right: - temp_right_read = (right_start, right_end) + fragment_pool.append(fragment) + temp_fragments.append((start, end)) + start = end + + # Generating reads from fragments + for fragment in temp_fragments: + read_start = fragment[0] + read_end = read_start + options.read_len + # This filters out those small fragments, to give the dataset some realistic variety + if read_end > fragment[1]: + continue else: - temp_right_read = (0, 0) - - read_to_add = temp_left_read + temp_right_read - temp_reads.append(read_to_add) - read_count += 1 - # bump it a bit to give it a little variety. - start_coordinate = min( - max(start_coordinate + options.rng.integers(-10, 10), 0), - span_length - fragment_length - 1 - ) - - # Subset this set of temporary reads to make sure we don't overdo it. - reads_to_add, final_coverage = final_subsetting( - candidate_reads=temp_reads, - target=subsection_target, - coverage_vector=final_coverage, - coordinates=section_id, - rng=options.rng - ) - final_reads.extend(reads_to_add) - - return final_reads - - -def final_subsetting( - candidate_reads: list, - target: int, - coverage_vector: list, - coordinates: tuple, - rng: Generator -) -> (list, list): - """ - The final subsetting for reads. We basically just calculate if we need to subset, then how many final reads we - should have, and we draw them randomly without replacement until we have enough. - - :param candidate_reads: The list of reads in the following format: (left_start, left_end, right_start, right_end) - :param target: The target coverage value for this section - :param coverage_vector: The coverage vector, will be updated with new counts. - :param coordinates: The indices of interest. - :param rng: The random number generator for the run - :return: Two lists, left and right reads, culled to proper coverage depth. - """ - - # Throw out fully empty reads. - filtered_candidates = [] - for read in candidate_reads: - if not any(read): - continue - elif read.count('N') > len(read)/2: - continue - else: - filtered_candidates.append(read) - - # if we're under target, we want to make up the difference. - # grab however many we need. 
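# Back-of-envelope check of the read-count estimate used in the new cover_dataset above.
# The numbers are invented for illustration; the 1.85 divisor is the patch's own
# admittedly arbitrary stand-in for "roughly two reads per proper pair".
from math import ceil
span_length, coverage, read_len = 1_000_000, 30, 150
number_reads = ceil((span_length * coverage) / read_len)  # = 200_000 reads for 30x at 150 bp
paired_records = ceil(number_reads / 1.85)                # = 108_109 fragment records, ~2 reads each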
- ret_set = rng.choice(filtered_candidates, size=target, replace=False) - - # Update the coverage vector - for i in range(coordinates[0], coordinates[1]): - coverage_vector[i] += target - - return ret_set, coverage_vector + read1 = (read_start, read_end) + if options.paired_ended: + # This will be valid because of the check above + read2 = (fragment[1] - options.read_len, fragment[1]) + else: + read2 = (0, 0) + # The structure for these reads will be (left_start, left_end, right_start, right_end) + # where start and end are ints with end > start. Reads can overlap, so right_start < left_end + # is possible, but the reads cannot extend past each other, so right_start < left_start and + # left_end > right_end are not possible. + read = read1 + read2 + if read not in final_reads: + final_reads.add(read) + read_count += 1 + + # Convert set to final list + final_reads = list(final_reads) + # Now we shuffle them to add some randomness + options.rng.shuffle(final_reads) + # And only return the number we needed + if options.paired_ended: + # Since each read is actually 2 reads, we only need to return half as many. But to cheat a few extra, we scale + # that down slightly to 1.85 reads per read. This factor is arbitrary and may even be a function. But let's see + # how well this estimate works + return final_reads[:ceil(number_reads/1.85)] + else: + # Each read lacks a pair, so we need the full number of single ended reads + return final_reads[:number_reads] def find_applicable_mutations(my_read: Read, all_variants: ContigVariants) -> dict: @@ -313,48 +184,6 @@ def replace_n(segment: Seq, rng: Generator) -> Seq: return Seq(modified_segment) -def modify_target_coverage( - target_regions: list, - discard_regions: list, - coverage_vector: np.ndarray, - coverage: int, - target_shape: int): - """ - Modifies the coverage vector by applying the list of regions. For this version, areas - outside the regions have coverage adjusted by the off_target_percent - - :param target_regions: A list of intervals to target, extracted from a bed file - :param discard_regions: A list of regions to throw out, extracted from a bed file - :param coverage_vector: The target coverage vector, which will be modified - :param coverage: The target coverage value at this position - :param target_shape: The size of the bins in the final output - :return: The updated target coverage vector, binned into target_shape sized bins. - """ - - for region in target_regions: - # in the included regions, any are marked false is to be excluded. Everything else remains untouched - if not region[2]: - coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0]) - - for region in discard_regions: - # in the discard regions section, a True indicates this region falls - # within a discard bed and should be discarded. 
Note that this will discard regions if the discard and target - # beds overlap - if region[2]: - coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0]) - - # now for the final coverage vector, we need to bin by the window bias in the gc-bias model - start = 0 - final_coverage_vector = [] - for i in range(target_shape, len(coverage_vector), target_shape): - subsection = coverage_vector[start: i] - subsection_average = round(sum(subsection)/len(subsection)) # round() with no second argument returns an int - final_coverage_vector.append(subsection_average) - start = i - - return final_coverage_vector - - def merge_sort(my_array: np.ndarray): """ This sorts the reads in reverse position, merging as it goes, in order to get the proper order @@ -411,60 +240,30 @@ def generate_reads(reference: SeqRecord, start = time.time() base_name = f'{Path(options.output).name}-{chrom}' - # Assume uniform coverage - target_coverage_vector = np.full(shape=len(reference), fill_value=options.coverage) - if not options.no_coverage_bias: - pass - # I'm trying to move this section into cover_dataset. - # target_coverage_vector = gc_bias.create_coverage_bias_vector(target_coverage_vector, reference.seq.upper()) - - # We want to bin the coverage into window-sized segments to speed up calculations. - # This divides the segment into len(reference) // window_size (integer division). - target_shape = len(reference) // gc_bias.window_size - - # Apply the targeting/discarded rates. - target_coverage_vector = modify_target_coverage( - targeted_regions, discarded_regions, target_coverage_vector, options.coverage, target_shape - ) _LOG.debug("Covering dataset.") t = time.process_time() reads = cover_dataset( len(reference), - target_coverage_vector, options, - gc_bias, fraglen_model, ) - _LOG.debug(f"Data set coverage took: {time.process_time() - t}") - - # Reads that are paired - paired_reads = np.asarray([tuple(x) for x in reads if any(x[0:2]) and any(x[2:4])]) - if paired_reads.size: - paired_reads = merge_sort(paired_reads) + _LOG.debug(f"Dataset coverage took: {time.process_time() - t}") - # singletons - if paired_reads.any(): - singletons = np.asarray([tuple(x) for x in reads if x not in paired_reads and any(x)]) - else: - singletons = np.asarray([tuple(x) for x in reads if any(x)]) - if singletons.size: - singletons = merge_sort(singletons) - - # determine sam read order. It should be paired reads, then singletons, unless one or the other is missing. - if paired_reads.size and singletons.size: - sam_read_order = np.concatenate((paired_reads, singletons)) - elif paired_reads.size: - # if singletons is empty - sam_read_order = paired_reads - else: - # if there are no paired reads - sam_read_order = singletons + # Filters left to apply: N filter... not sure yet what to do with those (output random low quality reads, maybe) + # Target regions + # discard regions + # GC-bias figure out some way to throw out 75% of reads with high GC or AT content (see original gc bias model). + # heterozygous variants. Okay, so I've been applying the variant to all but for heterozygous we need to apply the + # variant to only some + paired_reads = [] + singletons = [] final_sam_dict = {} - if options.paired_ended: - _LOG.debug(f"Paired percentage = {len(paired_reads)/len(sam_read_order)}") + # This is the orderd the reads will be in when sorted in the final sam file, if we're doing that. They are + # easier to sort now as sets of coordinates than they will be later as lines in a file. 
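# Small illustration (coordinates invented) of why sorting the raw coordinate tuples is
# enough here: Python orders tuples lexicographically, so records sort by left_start first,
# which is roughly the coordinate order the eventual sam/bam wants. The explicit
# key=lambda element: element[0:] below is equivalent to the default tuple sort.
example_reads = [(300, 450, 480, 630), (0, 150, 200, 350), (120, 270, 0, 0)]
assert sorted(example_reads) == [(0, 150, 200, 350), (120, 270, 0, 0), (300, 450, 480, 630)]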
+ sam_dict_order = sorted(reads, key=lambda element: (element[0:])) _LOG.debug("Writing fastq(s) and optional tsam, if indicated") t = time.process_time() @@ -561,5 +360,8 @@ def generate_reads(reference: SeqRecord, with open_output(reads_pickle) as reads: pickle.dump(final_sam_dict, reads) + if options.paired_ended: + _LOG.debug(f"Paired percentage = {len(paired_reads)/len(sam_read_order)}") + _LOG.info(f"Finished sampling reads in {time.time() - start} seconds") return chrom_fastq_r1, chrom_fastq_r2 diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index de9cab34..8ad67f12 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -299,18 +299,23 @@ def log_configuration(self): _LOG.info(f"Single threading - 1 thread.") # We'll work on multithreading later... # _LOG.info(f'Multithreading - {options.threads} threads') + if self.fragment_mean: + if self.fragment_st_dev: + _LOG.info(f'Generating fragments based on mean={self.fragment_mean}, ' + f'stand. dev={self.fragment_st_dev}') + else: + raise ValueError("Provided fragment mean, but no fragment standard deviation!") if self.paired_ended: _LOG.info(f'Running in paired-ended mode.') if self.fragment_model: _LOG.info(f'Using fragment length model: {self.fragment_model}') - elif self.fragment_mean and self.fragment_st_dev: - _LOG.info(f'Generating fragment model based on mean={self.fragment_mean}, ' - f'st dev={self.fragment_st_dev}') + elif self.fragment_mean: + pass # Already addressed this above else: raise ValueError("Paired ended mode requires either a fragment model or a mean and standard deviation.") - else: _LOG.info(f'Running in single-ended mode.') + _LOG.info(f'Using a read length of {self.read_len}') _LOG.info(f'Average coverage: {self.coverage}') if self.error_model: diff --git a/tests/test_read_simulator/test_cover_dataset.py b/tests/test_read_simulator/test_generate_reads.py similarity index 73% rename from tests/test_read_simulator/test_cover_dataset.py rename to tests/test_read_simulator/test_generate_reads.py index c8193c1d..cbe1f996 100644 --- a/tests/test_read_simulator/test_cover_dataset.py +++ b/tests/test_read_simulator/test_generate_reads.py @@ -1,65 +1,69 @@ +from math import ceil + import pytest import numpy as np -from neat.models import FragmentLengthModel -from neat.read_simulator.utils import Options, cover_dataset +from neat.models import FragmentLengthModel, GcModel +from neat.read_simulator.utils import Options +from neat.read_simulator.utils.generate_reads import * def test_cover_dataset(): """Test that a cover is successfully generated for different coverage values""" - read_pool = [10] * 2000 span_length = 100 - target_vector = np.full(100, fill_value=10, dtype=int) options = Options(rng_seed=0) - options.read_len = 101 - options.paired_ended = True - options.fragment_mean = 250 - options.fragment_st_dev = 100 - options.output.overwrite_output = True - fragment_model = FragmentLengthModel(rng=options.rng) + options.read_len = 10 + options.paired_ended = False + options.overwrite_output = True + fragment_model = FragmentLengthModel(15, 5, rng=options.rng) - coverage_values = [1, 2, 5, 10, 25, 50] + coverage_values = [1, 2, 5] for coverage in coverage_values: options.coverage = coverage - read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model) + reads = cover_dataset(span_length, options, fragment_model) coverage_check = [] for i in range(span_length): # single ended test, only need read1 - cover 
= [x for x in read1 if i in range(x[0], x[1])] + cover = [x for x in reads if i in range(x[0], x[1])] coverage_check.append(len(cover)) - assert sum(coverage_check)/len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}" + print(f"\nRequested Coverage: {coverage}, Average Coverage: {sum(coverage_check) / len(coverage_check)}") + assert sum(coverage_check)/len(coverage_check) > 0.9 * coverage, f"Coverage check failed for coverage {coverage}" + for read in reads: + if read[1] - read[0] < 10: + raise AssertionError("failed to filter out a small read length") + assert len(reads) >= (100 * coverage)/10 def test_paired_cover_dataset(): """Test that a cover is successfully generated for different coverage values""" - read_pool = [10] * 2000 span_length = 100 - target_vector = np.full(100, fill_value=10, dtype=int) options = Options(rng_seed=0) - options.read_len = 101 + options.read_len = 10 options.paired_ended = True - options.fragment_mean = 250 - options.fragment_st_dev = 100 - options.output.overwrite_output = True - fragment_model = FragmentLengthModel(fragment_mean=20, fragment_std=2, fragment_min=10, fragment_max=30, rng=options.rng) + options.overwrite_output = True + fragment_model = FragmentLengthModel(25, 10, rng=options.rng) - coverage_values = [1, 2, 5, 10, 25, 50] + coverage_values = [1, 2, 5] for coverage in coverage_values: options.coverage = coverage - read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model) + reads = cover_dataset(span_length, options, fragment_model) coverage_check = [] for i in range(span_length): # paired ended test, need both read1 and read2 - cover = [x for x in read1 + read2 if i in range(x[0], x[1])] + cover = [x for x in reads if (i in range(x[0], x[1]) or i in range(x[2], x[3]))] coverage_check.append(len(cover)) - assert sum(coverage_check) / len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}" + print(f"\nRequested Coverage: {coverage}, Average Coverage: {sum(coverage_check) / len(coverage_check)}") + assert sum(coverage_check) / len(coverage_check) > 0.9 * coverage, \ + f"Coverage check failed for coverage {coverage}" + for read in reads: + if read[1] - read[0] < 10 or read[3] - read[2] < 10: + raise AssertionError("failed to filter out a small read length") + assert len(reads) >= (100 * coverage)/20 def test_various_read_lengths(): """Test cover_dataset with various read lengths to ensure no errors""" - read_pool = [10] * 2000 - span_length = 100 - target_vector = np.full(100, fill_value=10, dtype=int) + span_length = 1000 options = Options(rng_seed=0) options.paired_ended = True options.coverage = 10 @@ -71,7 +75,7 @@ def test_various_read_lengths(): for read_len in range(10, 251, 10): options.read_len = read_len try: - read1, _ = cover_dataset(read_pool, span_length, target_vector, options, fragment_model) + read1, _ = cover_dataset(span_length, options, fragment_model) except Exception as e: pytest.fail(f"Test failed for read_len={read_len} with exception: {e}") @@ -100,6 +104,7 @@ def test_fragment_mean_st_dev_combinations(): except Exception as e: pytest.fail(f"Test failed for mean={mean}, st_dev={st_dev} with exception: {e}") + def test_coverage_ploidy_combinations(): """Test cover_dataset with various combinations of coverage and ploidy values to ensure no errors""" read_pool = [10] * 2000 From 19770defc8f7445f90a5192507ec266e1ddeaf2c Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 20 May 2024 01:10:55 -0500 Subject: [PATCH 2/8] Still working 
through updates to coverage, properly implementing the heterozygosity. --- neat/common/ploid_functions.py | 26 +- neat/read_simulator/runner.py | 4 +- neat/read_simulator/utils/generate_reads.py | 266 +++++++++++------- .../read_simulator/utils/local_file_writer.py | 1 + .../utils/output_file_writer.py | 4 +- neat/read_simulator/utils/read.py | 172 ++++++----- .../test_generate_reads.py | 23 ++ 7 files changed, 302 insertions(+), 194 deletions(-) diff --git a/neat/common/ploid_functions.py b/neat/common/ploid_functions.py index d45682cc..1046c52d 100644 --- a/neat/common/ploid_functions.py +++ b/neat/common/ploid_functions.py @@ -28,24 +28,18 @@ def pick_ploids(ploidy: int, :return: a list of strings representing the genotype of each ploid. """ # number of ploids to make this mutation on (always at least 1) + how_many = 1 if rng.random() < homozygous_frequency: - if ploidy <= 2: - # If it's homozygous with a ploidy of 2 it's on both. - how_many = ploidy - else: - # if it's polyploid, it's on more than one ploid - # TODO may need to improve the modeling for polyploid - how_many = 1 - for i in range(ploidy): - # Not totally sure how to model this, so I'm counting each - # ploid as a separate homozygous event. That doesn't exactly make - # sense though, so we'll improve this later. - if rng.random() < homozygous_frequency: - how_many += 1 - else: - break + # We'll consider this one to be homozygous + how_many = ploidy else: - how_many = 1 + if ploidy == 1: + # special case where heteroyzgous makes no sense + how_many = 1 + else: + # if it's polyploid, we'll consider it to be on roughly half the ploids + # TODO may need to improve the modeling for polyploid, maybe + how_many = ploidy//2 # wp is just the temporary genotype list, a hat tip to the old version of NEAT wp = np.zeros(ploidy) diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 2e9f4965..8ff25b96 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -175,6 +175,7 @@ def read_simulator_runner(config: str, output: str): """ _LOG.info(f'Reading {options.reference}.') + # TODO check into SeqIO.index_db() reference_index = SeqIO.index(str(options.reference), "fasta") _LOG.debug("Reference file indexed.") @@ -301,7 +302,7 @@ def read_simulator_runner(config: str, output: str): pass if options.paired_ended: - max_qual_score = max(seq_error_model_1.quality_scores + seq_error_model_2.quality_scores) + max_qual_score = max(seq_error_model_1.quality_scores, seq_error_model_2.quality_scores) else: max_qual_score = max(seq_error_model_1.quality_scores) @@ -335,6 +336,7 @@ def read_simulator_runner(config: str, output: str): local_bam_pickle_file, seq_error_model_1, seq_error_model_2, + mut_model, gc_bias_model, fraglen_model, local_variants, diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 7a4bc699..73976665 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -10,7 +10,7 @@ from numpy.random import Generator from bisect import bisect_left, bisect_right -from ...models import SequencingErrorModel, GcModel, FragmentLengthModel +from ...models import SequencingErrorModel, GcModel, FragmentLengthModel, MutationModel from .options import Options from ...common import open_input, open_output, ALLOWED_NUCL from ...variants import ContigVariants @@ -19,24 +19,12 @@ __all__ = [ 'generate_reads', 'cover_dataset', + 'overlaps', ] _LOG = logging.getLogger(__name__) -def 
is_too_many_n(segment): - """ - Checks that the segment is valid and there aren't too many invalid characters in it. - - :param segment: the sequence to check - :return: True if there are too many invalid characters, or no reads; False otherwise - """ - if not segment: - return True - n = segment.upper().count('N') - return n / len(segment) >= 0.2 - - """ Currently this code block is unused, but I think it might be useful for parallelization, so I wanted to leave it in. @@ -198,10 +186,33 @@ def merge_sort(my_array: np.ndarray): return ret_array +def overlaps(test_interval: tuple[int, int], comparison_interval: tuple[int, int]) -> bool: + """ + This function checks if the read overlaps with an input interval. + :param test_interval: the interval to test, expressed as a tuple of end points + (understood to be a half-open interval) + :param comparison_interval: the interval to check against, expressed as a tuple of end points + + Four situations where we can say there is an overlap: + 1. The comparison interval contains the test interval start point + 2. The comparison interval contains the test interval end point + 3. The comparison interval contains both start and end points of the test interval + 4. The comparison interval is within the test interval + Although 3 is really just a special case of 1 and 2, so we don't need a separate check + + If the read is equal to the interval, then all of these will be trivially true, + and we don't need a separate check. + """ + return (comparison_interval[0] < test_interval[1] < comparison_interval[1]) or \ + (comparison_interval[0] <= test_interval[0] < comparison_interval[1]) or \ + (test_interval[0] <= comparison_interval[0] and test_interval[1] >= comparison_interval[1]) + + def generate_reads(reference: SeqRecord, reads_pickle: str, error_model_1: SequencingErrorModel, error_model_2: SequencingErrorModel | None, + mutation_model: MutationModel, gc_bias: GcModel, fraglen_model: FragmentLengthModel, contig_variants: ContigVariants, @@ -219,6 +230,7 @@ def generate_reads(reference: SeqRecord, :param reads_pickle: The file to put the reads generated into, for bam creation. :param error_model_1: The error model for this run, the forward strand :param error_model_2: The error model for this run, reverse strand + :param mutation_model: The mutation model for this run :param gc_bias: The GC-Bias model for this run :param fraglen_model: The fragment length model for this run :param contig_variants: An object containing all input and randomly generated variants to be included. @@ -257,13 +269,9 @@ def generate_reads(reference: SeqRecord, # heterozygous variants. Okay, so I've been applying the variant to all but for heterozygous we need to apply the # variant to only some - paired_reads = [] + # These will hold the values as inserted. + properly_paired_reads = [] singletons = [] - final_sam_dict = {} - if options.paired_ended: - # This is the orderd the reads will be in when sorted in the final sam file, if we're doing that. They are - # easier to sort now as sets of coordinates than they will be later as lines in a file. 
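# Referring back to the overlaps() helper added above: with half-open intervals, the four
# cases in its docstring can also be reasoned about with the single compact condition
# below. This simplified form is only a sanity check against the same cases the tests
# exercise, not the form the patch uses.
def overlaps_simple(test, comparison):
    return test[0] < comparison[1] and comparison[0] < test[1]
assert overlaps_simple((0, 151), (120, 400))      # clips the left edge of the region
assert overlaps_simple((100, 500), (120, 400))    # fully contains the region
assert not overlaps_simple((0, 120), (120, 400))  # half-open: merely touching is no overlap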
- sam_dict_order = sorted(reads, key=lambda element: (element[0:])) _LOG.debug("Writing fastq(s) and optional tsam, if indicated") t = time.process_time() @@ -274,94 +282,150 @@ def generate_reads(reference: SeqRecord, for i in range(len(reads)): print(f'{i/len(reads):.2%}', end='\r') - # Added some padding after, in case there are deletions - segments = [reference[reads[i][0]: reads[i][1] + 50].seq.upper(), - reference[reads[i][2]: reads[i][3] + 50].seq.upper()] - # Check for N concentration - # Make sure the segments will come out to at around 80% valid bases - # So as not to mess up the calculations, we will skip the padding we added above. - if not is_too_many_n(segments[0][:-50] + segments[1][:-50]): - read_name = f'{base_name}-{str(i)}' - - if options.produce_bam: - # This index gives the position of the read for the final bam file - # [0][0] gives us the first index where this occurs (which should be the only one) - final_order_read_index = np.where(sam_read_order == reads[i])[0][0] - if final_order_read_index not in final_sam_dict: - final_sam_dict[final_order_read_index] = [] - - # reads[i] = (left_start, left_end, right_start, right_end) - # If there is a read one: - if any(reads[i][0:2]): - segment = segments[0] - segment = replace_n(segment, options.rng) - - if any(reads[i][2:4]): - is_paired = True - else: - is_paired = False - - read1 = Read(name=read_name + "/1", - raw_read=reads[i], - reference_segment=segment, - reference_id=reference.id, - position=reads[i][0] + ref_start, - end_point=reads[i][1] + ref_start, - quality_offset=options.quality_offset, - is_paired=is_paired - ) - - read1.mutations = find_applicable_mutations(read1, contig_variants) - - read1.finalize_read_and_write(error_model_1, fq1, options.produce_fastq) - - if options.produce_bam: - # Save this for later - final_sam_dict[final_order_read_index].append(read1) - - elif options.produce_bam: - # If there's no read1, append a 0 placeholder - final_sam_dict[final_order_read_index].append(0) - - if any(reads[i][2:4]): - segment = segments[1] - segment = replace_n(segment, options.rng) - - if any(reads[i][0:2]): - is_paired = True - else: - is_paired = False - - read2 = Read(name=read_name + "/2", - raw_read=reads[i], - reference_segment=segment, - reference_id=reference.id, - position=reads[i][2] + ref_start, - end_point=reads[i][3] + ref_start, - quality_offset=options.quality_offset, - is_reverse=True, - is_paired=is_paired - ) - - read2.mutations = find_applicable_mutations(read2, contig_variants) - read2.finalize_read_and_write(error_model_2, fq2, options.produce_fastq) - - if options.produce_bam: - # Save this for later - final_sam_dict[final_order_read_index].append(read2) - - elif options.produce_bam: - # If there's no read2, append a 0 placeholder - final_sam_dict[final_order_read_index].append(0) + # First thing we'll do is check to see if this read is filtered out by a bed file + read1, read2 = (reads[i][0], reads[i][1]), (reads[i][2], reads[i][3]) + found_read1, found_read2 = False, False + # For no target bed, there wil only be one region to check and this will complete very quickly + for region in targeted_regions: + # If this is a false region, we can skip it + if not region[2]: + continue + # We need to make sure this hasn't been filtered already, so if any coordinate is nonzero (falsey) + if any(read1): + # Check if read1 is in this targeted region (any part of it overlaps) + if overlaps(read1, (region[0], region[1])): + found_read1 = True + # Again, make sure it hasn't been filtered, or this is 
a single-ended read + elif any(read2): + if overlaps(read2, (region[0], region[1])): + found_read2 = True + # This read was outside of targeted regions + if not found_read1: + # Filter out this read + read1 = (0, 0) + if not found_read2: + # Note that for single ended reads, it will never find read2 and this does nothing (it's already (0,0)) + read2 = (0, 0) + + # If there was no discard bed, this will complete very quickly + discard_read1, discard_read2 = False, False + for region in discarded_regions: + # If region[2] is False then this region is not being discarded and we can skip it + if not region[2]: + continue + # Check to make sure the read isn't already filtered out + if any(read1): + if overlaps(read1, (region[0], region[1])): + discard_read1 = True + # No need to worry about already filtered reads + if any(read2): + if overlaps(read2, (region[0], region[1])): + discard_read2 = True + if discard_read1: + read1 = (0, 0) + if discard_read2: + read2 = (0, 0) + + # This raw read will replace the original reads[i], and is the raw read with filters applied. + raw_read = read1 + read2 + + # If both reads were filtered out, we can move along + if not any(raw_read): + continue + else: + # We must have at least 1 read that has data + # If only read1 or read2 is absent, this is a singleton + read1_is_singleton = False + read2_is_singleton = False + if not any(read2): + # Note that this includes all single ended reads that passed filter + read1_is_singleton = True + singletons.append(raw_read) + elif not any(read1): + read2_is_singleton = True + singletons.append(raw_read) + else: + properly_paired_reads.append(raw_read) + + read_name = f'{base_name}-{str(i)}' + + # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended + if not read2_is_singleton: + # It's properly paired if it's not a singleton + is_properly_paired = not read1_is_singleton + # add a small amount of padding to the end to account for deletions + # 30 is basically arbitrary. This may be a function of read length or determinable? + # Though we don't figure out the variants until later + segment = reference[read1[0]: read1[1] + 30] + + read1 = Read(name=read_name + "/1", + raw_read=raw_read, + reference_segment=segment, + reference_id=reference.id, + position=read1[0] + ref_start, + end_point=read1[1] + ref_start, + quality_offset=options.quality_offset, + is_paired=is_properly_paired + ) + + read1.mutations = find_applicable_mutations(read1, contig_variants) + read1.finalize_read_and_write(error_model_1, mutation_model, fq1, options.produce_fastq) + + # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip + if not read1_is_singleton: + is_properly_paired = not read2_is_singleton + # Because this segment reads back to front, we need padding at the beginning. + # 30 is arbitrary. This may be a function of read length or determinable? 
+ # Though we don't figure out the variants until later + segment = reference[read2[0] - 30: read2[1]] + + read2 = Read(name=read_name + "/2", + raw_read=reads[i], + reference_segment=segment, + reference_id=reference.id, + position=read2[0] + ref_start, + end_point=read2[1] + ref_start, + quality_offset=options.quality_offset, + is_reverse=True, + is_paired=is_properly_paired + ) + + read2.mutations = find_applicable_mutations(read2, contig_variants) + read2.finalize_read_and_write(error_model_2, fq2, options.produce_fastq) _LOG.debug(f"Fastqs written in: {time.process_time() - t} s") + #TODO figure this bit out + if options.produce_bam: + # Save this for later + raw_sam_dict[final_order_read_index].append((read1, read1_is_singleton)) + + if options.produce_bam: + # Save this for later + raw_sam_dict[final_order_read_index].append((read2, read2_is_singleton)) + + # This is the order of the reads as they + sam_dict_order = [] + raw_sam_dict = {} + if options.paired_ended: + # This is the ordered the reads will be in when sorted in the final sam file, if we're doing that. They are + # easier to sort now as sets of coordinates than they will be later as lines in a file. + sam_dict_order = sorted(reads, key=lambda element: (element[0:])) + + if options.produce_bam: + # This index gives the position of the read for the final bam file + # [0][0] gives us the first index where this occurs (which should be the only one) + final_order_read_index = np.where(sam_dict_order == reads[i])[0][0] + if final_order_read_index not in raw_sam_dict: + raw_sam_dict[final_order_read_index] = [] + + if options.produce_bam: with open_output(reads_pickle) as reads: - pickle.dump(final_sam_dict, reads) + pickle.dump(raw_sam_dict, reads) if options.paired_ended: - _LOG.debug(f"Paired percentage = {len(paired_reads)/len(sam_read_order)}") + _LOG.debug(f"Paired percentage = {len(reads)/len(sam_dict_order)}") _LOG.info(f"Finished sampling reads in {time.time() - start} seconds") return chrom_fastq_r1, chrom_fastq_r2 diff --git a/neat/read_simulator/utils/local_file_writer.py b/neat/read_simulator/utils/local_file_writer.py index 607f7fdb..a291d4c3 100644 --- a/neat/read_simulator/utils/local_file_writer.py +++ b/neat/read_simulator/utils/local_file_writer.py @@ -47,6 +47,7 @@ def write_local_file(vcf_filename: str or Path, for loc in variant_data.variant_locations: for variant in variant_data[loc]: if not variant.is_input: + # Todo make sure this section still works target_region = find_region(loc, targeted_regions) # This will be True in targeted regions, if a bed is present, or everywhere if not bed is present. # Anything outside defined target regions will be marked false and this `if` will activate. diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index 244e225a..c065ea46 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -340,7 +340,7 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit SEQ_PACKED[alt_sequence[2 * i + 1].capitalize()])) # apparently samtools automatically adds 33 to the quality score string... 
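# For context on the comment above (standard Phred encoding rather than anything this
# patch defines, and assuming quality_offset is the usual 33): fastq quality characters
# are chr(score + offset), so the encoded_qual line below strips the offset back off to
# recover raw scores for the bam record.
qual_string = "".join(chr(q + 33) for q in [30, 35, 2])  # -> '?D#'
raw_scores = [ord(c) - 33 for c in qual_string]          # -> [30, 35, 2]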
- encoded_qual = ''.join([chr(ord(n) - 33) for n in read.read_quality_score]) + encoded_qual = ''.join([chr(ord(n) - 33) for n in read.read_quality_string]) """ block_size = 4 + # refID int32 @@ -357,7 +357,7 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit len(seq) """ - block_size = 32 + len(read.name) + 1 + len(encoded_cig) + len(encoded_seq) + len(read.read_quality_score) + block_size = 32 + len(read.name) + 1 + len(encoded_cig) + len(encoded_seq) + len(read.read_quality_string) bam_handle.write((pack(' 1: # Generate extra scores for insertions - new_quality_score.extend(model.rng.choice(low_scores, size=len(alternate)-1)) + new_quality_score.extend(rng.choice(low_scores, size=len(alternate)-1)) # If is deletion elif len(reference) > 1 and len(alternate) == 1: new_quality_score = new_quality_score[:1] # SNP else: - new_quality_score = model.rng.choice(low_scores, size=1) + new_quality_score = rng.choice(low_scores, size=1) # Replace the given quality score with the new one self.quality_array = \ @@ -156,13 +161,6 @@ def update_quality_array( np.array(new_quality_score), self.quality_array[location+len(reference):])) - if len(self.quality_array) < self.length: - # Just in case we need to fill out the quality score array - temp_list = self.quality_array.tolist() - self.quality_array = np.array(temp_list.append( - model.rng.choice(model.quality_scores, size=self.length-len(self.quality_array)) - )) - def apply_errors(self, mutated_sequence, err_model): """ This function applies errors to a sequence and calls the update_quality_array function after @@ -176,68 +174,89 @@ def apply_errors(self, mutated_sequence, err_model): mutated_sequence = \ mutated_sequence[:error.location] + error.alt + mutated_sequence[error.location+len(error.ref):] # update quality score for error - self.update_quality_array(error.ref, error.alt, error.location, "error", err_model) + self.update_quality_array( + error.ref, + error.alt, + error.location, + "error", + err_model.quality_scores, + err_model.rng + ) return mutated_sequence - def apply_mutations(self, mutated_sequence, err_model): + def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model: MutationModel): """ Applying mutations involves one extra step, because of polyploidism. There may be more than one mutation at a given location, so it is formulated as a list. We then pick one at random for this read. :param mutated_sequence: The sequence we are applying the variant to - :param err_model: The error_model for the run, used for the quality score and for the rng + :param quality_scores: The possible quality scores for this run (to update quality scores) + :param mut_model: The mutation model for the run, used for the zygosity and for the rng :return: mutated sequence, with mutations applied """ - # Start at the right position based on if this is the forward or reverse read. - segment_start = int(self.raw_read[2]) if self.is_reverse else int(self.raw_read[0]) for location in self.mutations: - # There may be more than one variant to apply, because of polyploidism. - # So we'll pick one at random for this read, each with equal probability. - # We may want to add a model to this. 
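# Hedged sketch of the zygosity draw that the reworked apply_mutations below performs
# (names here are illustrative; only the rng.choice-over-genotype idea comes from the
# patch): a genotype is a 0/1 vector over ploids, and a read keeps the variant only when
# the ploid it samples is a 1, so a diploid heterozygote lands in roughly half the reads.
import numpy as np
rng = np.random.default_rng(0)
genotype = np.array([0, 1])  # heterozygous on a diploid contig
hits = sum(rng.choice(genotype) for _ in range(10_000))
# expect hits to come out near 5_000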
- variant_to_apply = err_model.rng.choice(self.mutations[location]) + variant_to_apply = mut_model.rng.choice(self.mutations[location]) + # Fetch parameters qual_score = variant_to_apply.get_qual_score() - position = variant_to_apply.get_0_location() - segment_start - if type(variant_to_apply) == Insertion or type(variant_to_apply) == SingleNucleotideVariant: - reference_length = 1 - alternate = variant_to_apply.get_alt() - elif type(variant_to_apply) == Deletion: - reference_length = variant_to_apply.length - alternate = mutated_sequence[position] - else: - reference_length = variant_to_apply.get_ref_len() - alternate = mutated_sequence.get_alt() - - # Replace the entire ref with the entire alt - mutated_sequence = \ - mutated_sequence[:position] + alternate + mutated_sequence[position + reference_length:] - - self.update_quality_array( - self.reference_segment[location: location+reference_length], alternate, location, "mutation", err_model, qual_score - ) + # Find the position within the read for this variant + position = variant_to_apply.get_0_location() - self.position + # Figure out if the variant is in this read. If 'to_mutate' selects any 1, then it is mutated. + to_mutate = mut_model.rng.choice(variant_to_apply.genotype) + # If a 1 was selected, then apply the variant, else return the original sequence + if to_mutate: + if type(variant_to_apply) == Insertion or type(variant_to_apply) == SingleNucleotideVariant: + reference_length = 1 + alternate = variant_to_apply.get_alt() + elif type(variant_to_apply) == Deletion: + reference_length = variant_to_apply.length + alternate = mutated_sequence[position] + else: + reference_length = variant_to_apply.get_ref_len() + alternate = mutated_sequence.get_alt() + + # Replace the entire ref with the entire alt + mutated_sequence = mutated_sequence[:position] + alternate + mutated_sequence[position + reference_length:] + + self.update_quality_array( + self.reference_segment[location: location+reference_length], + alternate, + location, + "mutation", + quality_scores, + mut_model.rng, + qual_score + ) return mutated_sequence - def get_read(self, error_model) -> Seq: + def apply_variants_for_final_output( + self, + error_model: SequencingErrorModel, + mutation_model: MutationModel + ) -> Seq: """ Gets mutated sequence to output for fastq/bam :param error_model: The error model for the run + :param mutation_model: The mutation model for the run :return: the mutated sequence """ - mutated_sequence = self.reference_segment + # TODO Add filtering for bed files and heterozygosity right here + # The read sequence should have some padding still at this point + read_sequence = self.read_sequence.seq if self.mutations: - mutated_sequence = self.apply_mutations(mutated_sequence, error_model) + read_sequence = self.apply_mutations(read_sequence, list(error_model.quality_scores), mutation_model) if self.errors: - mutated_sequence = self.apply_errors(mutated_sequence, error_model) + read_sequence = self.apply_errors(read_sequence, error_model) - mutated_sequence = mutated_sequence[:self.length] + read_sequence = read_sequence[:self.length] self.quality_array = self.quality_array[:self.length] - return Seq(mutated_sequence) + return Seq(read_sequence) def contains(self, test_pos: int): return self.position <= test_pos < self.end_point @@ -275,6 +294,7 @@ def calculate_flags(self, paired_ended_run): def finalize_read_and_write( self, err_model: SequencingErrorModel, + mut_model: MutationModel, fastq_handle: TextIO, produce_fastq: bool, ): @@ -282,6 +302,7 @@ def 
finalize_read_and_write( Writes the record to the temporary fastq file :param err_model: The error model for the run + :param mut_model: The mutation model for the run :param fastq_handle: the path to the fastq model to write the read :param produce_fastq: If true, this will write out the temp fastqs. If false, this will only write out the tsams to create the bam files. @@ -290,28 +311,31 @@ def finalize_read_and_write( # Generate quality scores to cover the extended segment. We'll trim later self.quality_array = err_model.get_quality_scores(len(self.reference_segment)) - # Get errors for the read - self.errors = err_model.get_sequencing_errors(self.length, self.reference_segment, - self.quality_array) - - read_sequence = self.get_read(err_model) - if self.is_reverse: - self.quality_array = self.quality_array[::-1] - read_sequence = read_sequence.reverse_complement() - temp_position = self.position - self.position = len(self.reference_segment) - temp_position - self.length - self.end_point = len(self.reference_segment) - temp_position + read_quality_array = self.quality_array[::-1] + read_sequence = self.reference_segment.reverse_complement() + else: + read_quality_array = self.quality_array + read_sequence = self.reference_segment + + # Get errors for the read and update the quality score + self.errors = err_model.get_sequencing_errors(self.length, read_sequence, + read_quality_array) + # Update the read as needed: + self.quality_array = read_quality_array self.read_sequence = read_sequence - self.read_quality_score = "".join([chr(x + self.quality_offset) for x in self.quality_array]) + + self.read_sequence = self.apply_variants_for_final_output(err_model, mut_model) + + self.read_quality_string = "".join([chr(x + self.quality_offset) for x in self.quality_array]) self.mapping_quality = err_model.rng.poisson(70) if produce_fastq: fastq_handle.write(f'@{self.name}\n') fastq_handle.write(f'{str(self.read_sequence)}\n') fastq_handle.write('+\n') - fastq_handle.write(f'{self.read_quality_score}\n') + fastq_handle.write(f'{self.read_quality_string}\n') def make_cigar(self): """ diff --git a/tests/test_read_simulator/test_generate_reads.py b/tests/test_read_simulator/test_generate_reads.py index cbe1f996..883668ac 100644 --- a/tests/test_read_simulator/test_generate_reads.py +++ b/tests/test_read_simulator/test_generate_reads.py @@ -158,3 +158,26 @@ def test_single_ended_mode(): assert sum(coverage_check) / len(coverage_check) > options.coverage, "Coverage check failed in single-ended mode" except Exception as e: pytest.fail(f"Test failed in single-ended mode with exception: {e}") + + +def test_overlaps(): + comparsion_interval = (120, 400) + + interval1 = (0, 151) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (385, 421) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (130, 281) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (120, 400) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (100, 500) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (0, 100) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap + interval1 = (0, 120) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap + interval1 = (400, 450) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap + interval1 = (500, 700) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap From 70b9dc53c9e5abc18b83144359b54dabaede079f Mon Sep 17 
00:00:00 2001 From: Joshua Allen Date: Mon, 20 May 2024 23:38:34 -0500 Subject: [PATCH 3/8] new coverage working --- neat/cli/cli.py | 2 +- neat/models/models.py | 6 +- neat/read_simulator/runner.py | 2 +- neat/read_simulator/utils/generate_reads.py | 63 ++++++++------------- neat/read_simulator/utils/read.py | 43 ++++++++------ 5 files changed, 57 insertions(+), 59 deletions(-) diff --git a/neat/cli/cli.py b/neat/cli/cli.py index 3a41ddf2..826a13d7 100644 --- a/neat/cli/cli.py +++ b/neat/cli/cli.py @@ -140,7 +140,7 @@ def main(parser: argparse.ArgumentParser, arguments: list[str]) -> int: else: end = time.time() log.info( - "command finished successfully; execution took %.3f sec", end - start + f"command finished successfully; execution took {(end - start)/60:.2f} sec" ) return 0 diff --git a/neat/models/models.py b/neat/models/models.py index fef6f270..e8f37559 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -417,6 +417,9 @@ def get_sequencing_errors(self, del_blacklist = [] for index in error_indexes[::-1]: + # Skip any N's + if reference_segment[index] == 'N': + continue # determine error type. Most will be SNVs error_type = SingleNucleotideVariant @@ -622,5 +625,4 @@ def generate_fragments(self, """ # generates a distribution, assuming normality, then rounds the result and converts to ints dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int) - - return list(dist) + return [abs(x) for x in dist] diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 8ff25b96..cf70a4f5 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -302,7 +302,7 @@ def read_simulator_runner(config: str, output: str): pass if options.paired_ended: - max_qual_score = max(seq_error_model_1.quality_scores, seq_error_model_2.quality_scores) + max_qual_score = max(max(seq_error_model_1.quality_scores), max(seq_error_model_2.quality_scores)) else: max_qual_score = max(seq_error_model_1.quality_scores) diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 73976665..c68c20de 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -260,7 +260,7 @@ def generate_reads(reference: SeqRecord, options, fraglen_model, ) - _LOG.debug(f"Dataset coverage took: {time.process_time() - t}") + _LOG.debug(f"Dataset coverage took: {(time.process_time() - t)/60} m") # Filters left to apply: N filter... not sure yet what to do with those (output random low quality reads, maybe) # Target regions @@ -337,22 +337,22 @@ def generate_reads(reference: SeqRecord, # If only read1 or read2 is absent, this is a singleton read1_is_singleton = False read2_is_singleton = False + properly_paired = False if not any(read2): # Note that this includes all single ended reads that passed filter read1_is_singleton = True - singletons.append(raw_read) elif not any(read1): + # read1 got filtered out read2_is_singleton = True - singletons.append(raw_read) else: - properly_paired_reads.append(raw_read) + properly_paired = True read_name = f'{base_name}-{str(i)}' # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: # It's properly paired if it's not a singleton - is_properly_paired = not read1_is_singleton + is_paired = not read1_is_singleton # add a small amount of padding to the end to account for deletions # 30 is basically arbitrary. 
This may be a function of read length or determinable? # Though we don't figure out the variants until later @@ -365,7 +365,7 @@ def generate_reads(reference: SeqRecord, position=read1[0] + ref_start, end_point=read1[1] + ref_start, quality_offset=options.quality_offset, - is_paired=is_properly_paired + is_paired=is_paired ) read1.mutations = find_applicable_mutations(read1, contig_variants) @@ -373,7 +373,7 @@ def generate_reads(reference: SeqRecord, # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip if not read1_is_singleton: - is_properly_paired = not read2_is_singleton + is_paired = not read2_is_singleton # Because this segment reads back to front, we need padding at the beginning. # 30 is arbitrary. This may be a function of read length or determinable? # Though we don't figure out the variants until later @@ -387,45 +387,32 @@ def generate_reads(reference: SeqRecord, end_point=read2[1] + ref_start, quality_offset=options.quality_offset, is_reverse=True, - is_paired=is_properly_paired + is_paired=is_paired ) read2.mutations = find_applicable_mutations(read2, contig_variants) - read2.finalize_read_and_write(error_model_2, fq2, options.produce_fastq) - - _LOG.debug(f"Fastqs written in: {time.process_time() - t} s") - - #TODO figure this bit out - if options.produce_bam: - # Save this for later - raw_sam_dict[final_order_read_index].append((read1, read1_is_singleton)) - - if options.produce_bam: - # Save this for later - raw_sam_dict[final_order_read_index].append((read2, read2_is_singleton)) + read2.finalize_read_and_write(error_model_2, mutation_model, fq2, options.produce_fastq) + if properly_paired: + properly_paired_reads.append((read1, read2)) + elif read1_is_singleton: + # This will be the choice for all single-ended reads + singletons.append(read1) + else: + singletons.append(read2) - # This is the order of the reads as they - sam_dict_order = [] - raw_sam_dict = {} - if options.paired_ended: - # This is the ordered the reads will be in when sorted in the final sam file, if we're doing that. They are - # easier to sort now as sets of coordinates than they will be later as lines in a file. - sam_dict_order = sorted(reads, key=lambda element: (element[0:])) + _LOG.debug(f"Fastqs written in: {(time.process_time() - t)/60:.2f} m") if options.produce_bam: - # This index gives the position of the read for the final bam file - # [0][0] gives us the first index where this occurs (which should be the only one) - final_order_read_index = np.where(sam_dict_order == reads[i])[0][0] - if final_order_read_index not in raw_sam_dict: - raw_sam_dict[final_order_read_index] = [] - + # this will give us the proper read order of the elements, for the sam. 
They are easier to sort now + properly_paired_reads = sorted(properly_paired_reads) + singletons = sorted(singletons) + sam_order = properly_paired_reads + singletons - if options.produce_bam: with open_output(reads_pickle) as reads: - pickle.dump(raw_sam_dict, reads) + pickle.dump(sam_order, reads) - if options.paired_ended: - _LOG.debug(f"Paired percentage = {len(reads)/len(sam_dict_order)}") + if options.paired_ended: + _LOG.debug(f"Properly paired percentage = {len(properly_paired_reads)/len(sam_order)}") - _LOG.info(f"Finished sampling reads in {time.time() - start} seconds") + _LOG.info(f"Finished sampling reads in {(time.process_time() - start)/60:.2f} m") return chrom_fastq_r1, chrom_fastq_r2 diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py index 7ffcf722..bebe95ea 100644 --- a/neat/read_simulator/utils/read.py +++ b/neat/read_simulator/utils/read.py @@ -12,7 +12,7 @@ from bisect import bisect_right from typing import TextIO -from Bio.Seq import Seq +from Bio.Seq import Seq, MutableSeq from Bio import pairwise2 from Bio.pairwise2 import format_alignment from numpy.random import Generator @@ -121,7 +121,6 @@ def update_quality_array( location: int, variant_type: str, quality_scores: list, - rng: Generator, quality_score: int = 0 ): """ @@ -139,21 +138,21 @@ def update_quality_array( if variant_type == "mutation": new_quality_score = [quality_score] * len(alternate) else: - # Since we have an error here, we'll choose a lower score + # Since we have an error here, we'll choose a min score original_quality_score = self.quality_array[location: location+len(reference)] new_quality_score = original_quality_score.copy() - # Generate a pool of low scores to choose from: - low_scores = quality_scores[:bisect_right(quality_score, max(quality_scores) // 3)] + low_score = min(quality_scores) # If is insertion if len(alternate) > 1: # Generate extra scores for insertions - new_quality_score.extend(rng.choice(low_scores, size=len(alternate)-1)) + # Original ref is unaffected, so it's quality score remains the same + new_quality_score = [low_score] * (len(alternate) - 1) # If is deletion elif len(reference) > 1 and len(alternate) == 1: - new_quality_score = new_quality_score[:1] + new_quality_score = [] # SNP else: - new_quality_score = rng.choice(low_scores, size=1) + new_quality_score = [low_score] # Replace the given quality score with the new one self.quality_array = \ @@ -161,7 +160,7 @@ def update_quality_array( np.array(new_quality_score), self.quality_array[location+len(reference):])) - def apply_errors(self, mutated_sequence, err_model): + def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorModel): """ This function applies errors to a sequence and calls the update_quality_array function after @@ -180,12 +179,11 @@ def apply_errors(self, mutated_sequence, err_model): error.location, "error", err_model.quality_scores, - err_model.rng ) return mutated_sequence - def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model: MutationModel): + def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mut_model: MutationModel): """ Applying mutations involves one extra step, because of polyploidism. There may be more than one mutation at a given location, so it is formulated as a list. We then pick one at random for this read. 
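As a rough, standalone illustration of that selection step (not the patched method itself; the Candidate class and its genotype tuple are assumed stand-ins for however the variants are actually stored):

    import numpy as np

    rng = np.random.default_rng(0)

    class Candidate:
        # Assumed stand-in for a stored variant: genotype holds one entry per copy,
        # e.g. (0, 1) for a heterozygous call.
        def __init__(self, alt, genotype):
            self.alt = alt
            self.genotype = genotype

    def pick_variant_for_read(variants_at_location, rng):
        # One candidate is drawn per site; its genotype then decides whether
        # this particular read's copy actually carries the mutation.
        variant = rng.choice(variants_at_location)
        return variant if rng.choice(variant.genotype) == 1 else None

    pick_variant_for_read([Candidate("T", (0, 1)), Candidate("TA", (1, 1))], rng)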
@@ -201,8 +199,8 @@ def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model # Fetch parameters qual_score = variant_to_apply.get_qual_score() - # Find the position within the read for this variant - position = variant_to_apply.get_0_location() - self.position + # Find the position within the read for this variant, cast as a python int, instead of a numpy int + position = int(variant_to_apply.get_0_location() - self.position) # Figure out if the variant is in this read. If 'to_mutate' selects any 1, then it is mutated. to_mutate = mut_model.rng.choice(variant_to_apply.genotype) # If a 1 was selected, then apply the variant, else return the original sequence @@ -212,7 +210,12 @@ def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model alternate = variant_to_apply.get_alt() elif type(variant_to_apply) == Deletion: reference_length = variant_to_apply.length - alternate = mutated_sequence[position] + try: + alternate = mutated_sequence[position] + except: + print(f"sequence: {mutated_sequence}") + print(f"position: {position}") + raise else: reference_length = variant_to_apply.get_ref_len() alternate = mutated_sequence.get_alt() @@ -226,7 +229,6 @@ def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model location, "mutation", quality_scores, - mut_model.rng, qual_score ) @@ -245,9 +247,16 @@ def apply_variants_for_final_output( :return: the mutated sequence """ - # TODO Add filtering for bed files and heterozygosity right here + rng = mutation_model.rng # The read sequence should have some padding still at this point - read_sequence = self.read_sequence.seq + read_sequence = MutableSeq(self.read_sequence.seq) + # Deal with N's + for i in range(len(read_sequence)): + if read_sequence[i] == 'N': + # for now replace 'N' masked regions with pure noise. We may refine this in time + read_sequence[i] = rng.choice(['A', 'C', 'G', 'T']) + self.quality_array[i] = min(error_model.quality_scores) + if self.mutations: read_sequence = self.apply_mutations(read_sequence, list(error_model.quality_scores), mutation_model) if self.errors: From ed51167bba4d2503386f6de2e9bcc252ffe862af Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 23 May 2024 12:36:03 -0500 Subject: [PATCH 4/8] The changed code seems to be working. 
Will run a few more tests before issuing the PR --- neat/cli/cli.py | 2 +- neat/read_simulator/runner.py | 5 +- neat/read_simulator/utils/generate_reads.py | 115 +++++++++------ .../utils/output_file_writer.py | 137 +++++++++++------- neat/read_simulator/utils/read.py | 28 ++-- 5 files changed, 174 insertions(+), 113 deletions(-) diff --git a/neat/cli/cli.py b/neat/cli/cli.py index 826a13d7..848c8fd2 100644 --- a/neat/cli/cli.py +++ b/neat/cli/cli.py @@ -140,7 +140,7 @@ def main(parser: argparse.ArgumentParser, arguments: list[str]) -> int: else: end = time.time() log.info( - f"command finished successfully; execution took {(end - start)/60:.2f} sec" + f"command finished successfully; execution took {(end - start)/60:.2f} m" ) return 0 diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index cf70a4f5..0c6cd447 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -331,7 +331,7 @@ def read_simulator_runner(config: str, output: str): fasta_files.extend(local_fasta_file) if options.produce_fastq or options.produce_bam: - read1_fastq, read2_fastq = \ + read1_fastq_paired, read1_fastq_single, read2_fastq_paired, read2_fastq_single = \ generate_reads(local_reference, local_bam_pickle_file, seq_error_model_1, @@ -346,7 +346,8 @@ def read_simulator_runner(config: str, output: str): options, contig) - fastq_files.append((read1_fastq, read2_fastq)) + contig_temp_fastqs = ((read1_fastq_paired, read2_fastq_paired), (read1_fastq_single, read2_fastq_single)) + fastq_files.append(contig_temp_fastqs) if options.produce_bam: sam_reads_files.append(local_bam_pickle_file) diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index c68c20de..c158f87c 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -16,6 +16,8 @@ from ...variants import ContigVariants from .read import Read +# TODO check that we're not truncating reads with deletions, but getting a full 151 bases + __all__ = [ 'generate_reads', 'cover_dataset', @@ -245,13 +247,20 @@ def generate_reads(reference: SeqRecord, :return: A tuple of the filenames for the temp files created """ # Set up files for use. May not need r2, but it's there if we do. - chrom_fastq_r1 = temporary_directory / f'{chrom}_tmp_r1.fq.bgz' - chrom_fastq_r2 = temporary_directory / f'{chrom}_tmp_r2.fq.bgz' + # We will separate the properly paired and the singletons. + # For now, we are making an assumption that the chromosome name contains no invalid characters for bash file names + # such as `*` or `:` even though those are technically allowed. + # We need to insure "chrom" does not contain `_` but we will assume that is the case for now. + # TODO We'll need to add some checks to ensure that this is the case. 
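A quick illustration of the concern in the TODO above (file and read names here are purely illustrative): both the temp-file suffix and the read-name prefix are parsed back later by splitting on '_', so an underscore inside the contig name itself would truncate the recovered name.

    from pathlib import Path

    # Temp-file name back to contig (mirrors the removesuffix call used when merging):
    assert Path("chr21_r1_paired.fq.bgz").name.removesuffix("_r1_paired.fq.bgz") == "chr21"
    # Read name back to contig: fine while the contig has no underscores of its own...
    assert "NEAT-generated_chr21_17".removeprefix("NEAT-generated_").split("_")[0] == "chr21"
    # ...but a contig named 'chr_21' would come back truncated to 'chr'
    assert "NEAT-generated_chr_21_17".removeprefix("NEAT-generated_").split("_")[0] == "chr"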
+ chrom_fastq_r1_paired = temporary_directory / f'{chrom}_r1_paired.fq.bgz' + chrom_fastq_r1_single = temporary_directory / f'{chrom}_r1_single.fq.bgz' + chrom_fastq_r2_paired = temporary_directory / f'{chrom}_r2_paired.fq.bgz' + chrom_fastq_r2_single = temporary_directory / f'{chrom}_r2_single.fq.bgz' _LOG.info(f'Sampling reads...') - start = time.time() + start_time = time.time() - base_name = f'{Path(options.output).name}-{chrom}' + base_name = f'NEAT-generated_{chrom}' _LOG.debug("Covering dataset.") t = time.process_time() @@ -260,7 +269,7 @@ def generate_reads(reference: SeqRecord, options, fraglen_model, ) - _LOG.debug(f"Dataset coverage took: {(time.process_time() - t)/60} m") + _LOG.debug(f"Dataset coverage took: {(time.process_time() - t)/60:.2f} m") # Filters left to apply: N filter... not sure yet what to do with those (output random low quality reads, maybe) # Target regions @@ -276,8 +285,10 @@ def generate_reads(reference: SeqRecord, _LOG.debug("Writing fastq(s) and optional tsam, if indicated") t = time.process_time() with ( - open_output(chrom_fastq_r1) as fq1, - open_output(chrom_fastq_r2) as fq2 + open_output(chrom_fastq_r1_paired) as fq1_paired, + open_output(chrom_fastq_r1_single) as fq1_single, + open_output(chrom_fastq_r2_paired) as fq2_paired, + open_output(chrom_fastq_r2_single) as fq2_single ): for i in range(len(reads)): @@ -296,10 +307,10 @@ def generate_reads(reference: SeqRecord, if overlaps(read1, (region[0], region[1])): found_read1 = True # Again, make sure it hasn't been filtered, or this is a single-ended read - elif any(read2): + if any(read2): if overlaps(read2, (region[0], region[1])): found_read2 = True - # This read was outside of targeted regions + # This read was outside targeted regions if not found_read1: # Filter out this read read1 = (0, 0) @@ -347,7 +358,7 @@ def generate_reads(reference: SeqRecord, else: properly_paired = True - read_name = f'{base_name}-{str(i)}' + read_name = f'{base_name}_{str(i)}' # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: @@ -356,20 +367,27 @@ def generate_reads(reference: SeqRecord, # add a small amount of padding to the end to account for deletions # 30 is basically arbitrary. This may be a function of read length or determinable? 
# Though we don't figure out the variants until later - segment = reference[read1[0]: read1[1] + 30] - read1 = Read(name=read_name + "/1", - raw_read=raw_read, - reference_segment=segment, - reference_id=reference.id, - position=read1[0] + ref_start, - end_point=read1[1] + ref_start, - quality_offset=options.quality_offset, - is_paired=is_paired - ) - - read1.mutations = find_applicable_mutations(read1, contig_variants) - read1.finalize_read_and_write(error_model_1, mutation_model, fq1, options.produce_fastq) + # this start check ensures that we get a valid segment + segment = reference[read1[0]: read1[1] + 30].seq + + read_1 = Read( + name=read_name + "/1", + raw_read=raw_read, + reference_segment=segment, + reference_id=reference.id, + position=read1[0] + ref_start, + end_point=read1[1] + ref_start, + quality_offset=options.quality_offset, + is_paired=is_paired + ) + + read_1.mutations = find_applicable_mutations(read_1, contig_variants) + if is_paired: + handle = fq1_paired + else: + handle = fq1_single + read_1.finalize_read_and_write(error_model_1, mutation_model, handle, options.produce_fastq) # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip if not read1_is_singleton: @@ -377,30 +395,39 @@ def generate_reads(reference: SeqRecord, # Because this segment reads back to front, we need padding at the beginning. # 30 is arbitrary. This may be a function of read length or determinable? # Though we don't figure out the variants until later - segment = reference[read2[0] - 30: read2[1]] - - read2 = Read(name=read_name + "/2", - raw_read=reads[i], - reference_segment=segment, - reference_id=reference.id, - position=read2[0] + ref_start, - end_point=read2[1] + ref_start, - quality_offset=options.quality_offset, - is_reverse=True, - is_paired=is_paired - ) - - read2.mutations = find_applicable_mutations(read2, contig_variants) - read2.finalize_read_and_write(error_model_2, mutation_model, fq2, options.produce_fastq) + + # this start check ensures that we get a valid segment + start_cooridnate = max((read2[0] - 30), 0) + segment = reference[start_cooridnate: read2[1]].seq + + read_2 = Read( + name=read_name + "/2", + raw_read=reads[i], + reference_segment=segment, + reference_id=reference.id, + position=read2[0] + ref_start, + end_point=read2[1] + ref_start, + quality_offset=options.quality_offset, + is_reverse=True, + is_paired=is_paired + ) + + read_2.mutations = find_applicable_mutations(read_2, contig_variants) + if is_paired: + handle = fq2_paired + else: + handle = fq2_single + read_2.finalize_read_and_write(error_model_2, mutation_model, handle, options.produce_fastq) + if properly_paired: - properly_paired_reads.append((read1, read2)) + properly_paired_reads.append((read_1, read_2)) elif read1_is_singleton: # This will be the choice for all single-ended reads - singletons.append(read1) + singletons.append((read_1, None)) else: - singletons.append(read2) + singletons.append((None, read_2)) - _LOG.debug(f"Fastqs written in: {(time.process_time() - t)/60:.2f} m") + _LOG.debug(f"Contig fastq(s) written in: {(time.process_time() - t)/60:.2f} m") if options.produce_bam: # this will give us the proper read order of the elements, for the sam. 
They are easier to sort now @@ -414,5 +441,5 @@ def generate_reads(reference: SeqRecord, if options.paired_ended: _LOG.debug(f"Properly paired percentage = {len(properly_paired_reads)/len(sam_order)}") - _LOG.info(f"Finished sampling reads in {(time.process_time() - start)/60:.2f} m") - return chrom_fastq_r1, chrom_fastq_r2 + _LOG.info(f"Finished sampling reads in {(time.time() - start_time)/60:.2f} m") + return chrom_fastq_r1_paired, chrom_fastq_r1_single, chrom_fastq_r2_paired, chrom_fastq_r2_single diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index c065ea46..5958486f 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -191,68 +191,97 @@ def merge_temp_fastqs( will be no IO time. :param fastq_files: The temporary fastq files to combine in the final output, or to set the order for the - final bam + final bam. Each set of files has two subsets, the paired and the singles. We'll do all the paired first + then append any singletons :param paired_ended: whether this run is paired or single ended. :param rand_num_gen: the random number generator for the run """ - fastq_indexes = [] - read1_keys = [] - read2_keys = [] - - # Index the temp fastqs - for file_pair in fastq_files: + fastq_index_dict = {} + # read1_keys will hold all the paired r1 paired ended reads (if any) and all singleton reads + paired_keys = [] + singleton_keys = [] + + paired_files = [] + singleton_files = [] + + # First split the files into 2 categories. For paired ended reads, most reads will be paired, though some may + # have gotten filtered out with input files. We will append any singletons in paired ended mode at the end, + # Or else for single ended we will end up just writing the singletons. 
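The ordering idea, as a self-contained sketch with toy data (the names below are illustrative, not the variables used in this file): mate pairs are shuffled as units so R1 and R2 stay synchronized, and singletons are appended afterwards.

    import numpy as np

    rng = np.random.default_rng(0)
    r1_paired = {"read_1/1": "ACGT", "read_2/1": "GGCA"}
    r2_paired = {"read_1/2": "TTAG", "read_2/2": "CCAT"}
    singletons = {"read_3/1": "AATC"}

    paired_keys = list(zip(r1_paired, r2_paired))   # [('read_1/1', 'read_1/2'), ...]
    order = rng.permutation(len(paired_keys))       # shuffle pairs as units, not individual mates

    fq1_out, fq2_out = [], []
    for idx in order:
        key1, key2 = paired_keys[idx]
        fq1_out.append(r1_paired[key1])
        fq2_out.append(r2_paired[key2])
    for key in singletons:                          # strays (or all reads, for single-ended) go to R1
        fq1_out.append(singletons[key])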
+ for coupled_files in fastq_files: + paired_file_names, singleton_file_names = coupled_files + paired_files.append(paired_file_names) + singleton_files.append(singleton_file_names) + + # Index the temp paired-ended fastqs + for file_pair in paired_files: file1_index = SeqIO.index(str(file_pair[0]), 'fastq') - file2_index = None - if file_pair[1]: - file2_index = SeqIO.index(str(file_pair[1]), 'fastq') - read2_keys.append(list(file2_index)) + file2_index = SeqIO.index(str(file_pair[1]), 'fastq') + # Reconstruct the name of the reads + contig_name = Path(file_pair[0]).name.removesuffix('_r1_paired.fq.bgz') + # Either both will have data, or neither, so checking one is sufficient + if file1_index: + if contig_name not in fastq_index_dict: + fastq_index_dict[contig_name] = [] + # 1 and 2 for read 1 and read 2 + fastq_index_dict[contig_name] = {1: file1_index, 2: file2_index} + paired_keys.extend(list(zip(file1_index, file2_index))) + + # Index the singletons, or for single-ended reads, all reads + for file_pair in singleton_files: + if file_pair[0]: + file_index = SeqIO.index(str(file_pair[0]), 'fastq') + contig_name = Path(file_pair[0]).name.strip('_r1_single.fq.bgz') + elif file_pair[1]: + file_index = SeqIO.index(str(file_pair[1]), 'fastq') + contig_name = Path(file_pair[1]).name.strip('_r2_single.fq.bgz') else: - read2_keys.append(file2_index) - fastq_indexes.append((file1_index, file2_index)) - read1_keys.append(list(file1_index)) - - # Flatten keys lists to shuffle the order - all_read1_keys = [key for sublist in read1_keys for key in sublist] - all_read2_keys = [key for sublist in read2_keys for key in sublist] - + # So singletons for this contig, so move on + continue + + # A check in case all reads were properly paired and there are no singletons + if file_index: + if contig_name not in fastq_index_dict: + fastq_index_dict[contig_name] = [] + # To keep the data structure consistent, we point both keys at the same file + fastq_index_dict[contig_name] = {1: file_index, 2: file_index} + singleton_keys.extend(list(file_index)) + + shuffled_paired_keys = paired_keys.copy() + shuffled_singleton_keys = singleton_keys.copy() # Shuffle the keys - rand_num_gen.shuffle(all_read1_keys) - - read2_singletons = [key for key in all_read2_keys if key not in all_read1_keys] + rand_num_gen.shuffle(shuffled_paired_keys) + rand_num_gen.shuffle(shuffled_singleton_keys) + # So we can delete later + wrote_r2 = False with ( open_output(self.fastq1_fn) as fq1, open_output(self.fastq2_fn) as fq2 ): - for i in range(len(all_read1_keys)): - - current_key = all_read1_keys[i] - - # This gives us the first index of the 'row' in the keys table where this read is located, - # giving us the list index of the file index where I can find this read - current_index = [read1_keys.index(x) for x in read1_keys if current_key in x][0] - # This is the index of the read1 file, at the current key, which should be the read we want - read1 = fastq_indexes[current_index][0][current_key] + # First we add all properly paired reads + for i in range(len(shuffled_paired_keys)): + current_key = shuffled_paired_keys[i] + # reconstruct tho chromosome name + chrom_name = current_key[0].removeprefix("NEAT-generated_").split('_')[0] + # 1 here because this is read1 + read1 = fastq_index_dict[chrom_name][1][current_key[0]] SeqIO.write(read1, fq1, 'fastq') - - if current_key in fastq_indexes[current_index][1]: - read2 = fastq_indexes[current_index][1][current_key] - SeqIO.write(read2, fq2, 'fastq') - - for j in range(len(read2_singletons)): - - 
current_key = read2_singletons[j] - - current_index = [read2_keys.index(x) for x in read2_keys if current_key in x][0] - read = fastq_indexes[current_index][1][current_key] - SeqIO.write(read, fq2, 'fastq') - - if not paired_ended: - + # 2 for read2 + read2 = fastq_index_dict[chrom_name][2][current_key[1]] + SeqIO.write(read2, fq2, 'fastq') + if not wrote_r2: + wrote_r2 = True + + # Next we add the strays (or all reads, for single-ended) + for j in range(len(shuffled_singleton_keys)): + current_key = shuffled_singleton_keys[j] + chrom_name = current_key[0].removeprefix("NEAT_generated_").split('_')[0] + read = fastq_index_dict[chrom_name][1][current_key[0]] + SeqIO.write(read, fq1, 'fastq') + + if not wrote_r2: fastq2_path = Path(self.fastq2_fn) - - if fastq2_path.exists(): - fastq2_path.unlink() + fastq2_path.unlink() def output_bam_file(self, reads_files: list, contig_dict: dict): """ @@ -263,7 +292,7 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): :param contig_dict: A dictionary with the keys as contigs from the reference, and the values the index of that contig """ - + # TODO incorporate new read list (no longer a dictionary) from generate_reads pickle file bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL) bam_out.write("BAM\1") header = "@HD\tVN:1.4\tSO:coordinate\n" @@ -284,9 +313,9 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): for file in reads_files: contig_reads_data = pickle.load(gzip.open(file)) - for key in contig_reads_data: - read1 = contig_reads_data[key][0] - read2 = contig_reads_data[key][0] + for read_data in contig_reads_data: + read1 = read_data[0] + read2 = read_data[1] if read1: self.write_bam_record(read1, contig_dict[read1.reference_id], bam_out) @@ -317,7 +346,7 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit next_ref_id = contig_id - if mate_position is None: + if not mate_position: next_pos = 0 template_length = 0 else: diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py index bebe95ea..70399939 100644 --- a/neat/read_simulator/utils/read.py +++ b/neat/read_simulator/utils/read.py @@ -130,7 +130,6 @@ def update_quality_array( :param location: The first position of the mutation, in 0-based coordinates :param variant_type: Either "mutations" or "errors" :param quality_scores: The possible quality scores, used to adjust for mutations and errors - :param rng: The random number generator for the run :param quality_score: The quality score to use, since this has already been calculated for mutations. 
This must be included if type is 'mutation' :return: None, updates the quality array in place @@ -178,7 +177,7 @@ def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorM error.alt, error.location, "error", - err_model.quality_scores, + list(err_model.quality_scores), ) return mutated_sequence @@ -210,12 +209,7 @@ def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mu alternate = variant_to_apply.get_alt() elif type(variant_to_apply) == Deletion: reference_length = variant_to_apply.length - try: - alternate = mutated_sequence[position] - except: - print(f"sequence: {mutated_sequence}") - print(f"position: {position}") - raise + alternate = mutated_sequence[position] else: reference_length = variant_to_apply.get_ref_len() alternate = mutated_sequence.get_alt() @@ -249,7 +243,7 @@ def apply_variants_for_final_output( rng = mutation_model.rng # The read sequence should have some padding still at this point - read_sequence = MutableSeq(self.read_sequence.seq) + read_sequence = MutableSeq(self.read_sequence) # Deal with N's for i in range(len(read_sequence)): if read_sequence[i] == 'N': @@ -353,8 +347,13 @@ def make_cigar(self): # These parameters were set to minimize breaks in the mutated sequence and find the best # alignment from there. + if self.is_reverse: + read_sequence = self.read_sequence.reverse_complement().upper() + else: + read_sequence = self.read_sequence.upper() + reference_segment = self.reference_segment.upper() raw_alignment = pairwise2.align.localms( - self.reference_segment, self.read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1, + reference_segment, read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1, penalize_extend_when_opening=True ) @@ -366,7 +365,8 @@ def make_cigar(self): cig_string = '' # Find first match start = min([aligned_mut_seq.find(x) for x in ALLOWED_NUCL]) - for char in range(start, start + len(self.read_sequence)): + end = start + len(read_sequence) + for char in range(start, end): if aligned_template_seq[char] == '-': # insertion if curr_char == 'I': # more insertions cig_count = cig_count + 1 @@ -391,7 +391,11 @@ def make_cigar(self): cig_string = cig_string + str(cig_count) + curr_char curr_char = 'M' cig_count = 1 - return cig_string + str(cig_count) + curr_char + # A check to make sure this doesn't end on a D + if curr_char == 'D': + return cig_string + str(cig_count) + curr_char + str(cig_count) + 'M' + else: + return cig_string + str(cig_count) + curr_char def get_mpos(self): """ From db2436c123310df000c8b11086efab666e9652b4 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 23 May 2024 19:19:58 -0500 Subject: [PATCH 5/8] Added some padding to handle edge case reads at the tail end of a chromosome. --- neat/models/models.py | 12 +++- neat/read_simulator/runner.py | 2 +- neat/read_simulator/utils/generate_reads.py | 22 +++++-- .../utils/output_file_writer.py | 16 +++-- neat/read_simulator/utils/read.py | 64 ++++++++++++------- 5 files changed, 77 insertions(+), 39 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index e8f37559..258a74f8 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -388,16 +388,17 @@ def __init__(self, def get_sequencing_errors(self, read_length: int, + padding: int, reference_segment: SeqRecord, quality_scores: np.ndarray): """ Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score based on the container model. 
:param read_length: The length of the read to generate errors for. + :param padding: this is the amount of space we have in the read for deletions. :param reference_segment: The section of the reference from which the read is drawn :param quality_scores: Array of quality scores for the read - :return: modified sequence and associated quality scores - + :return: Modified sequence and associated quality scores """ error_indexes = [] @@ -432,6 +433,10 @@ def get_sequencing_errors(self, # Deletion error if error_type == Deletion: deletion_length = self.get_deletion_length() + if padding - deletion_length < 0: + # No more space in this read to add deletions + padding = 0 + continue deletion_reference = reference_segment.seq[index: index + deletion_length + 1] deletion_alternate = deletion_reference[0] introduced_errors.append( @@ -439,6 +444,7 @@ def get_sequencing_errors(self, ) num_indels_so_far += deletion_length del_blacklist.extend(list(range(index, index + deletion_length))) + padding -= deletion_length elif error_type == Insertion: insertion_length = self.get_insertion_length() @@ -467,7 +473,7 @@ def get_sequencing_errors(self, if introduced_errors[i].location in del_blacklist: del introduced_errors[i] - return introduced_errors + return introduced_errors, max(padding, 0) def quality_index_remap(self, input_read_length): """ diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 0c6cd447..7b408944 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -371,4 +371,4 @@ def read_simulator_runner(config: str, output: str): _LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}") contig_list = list(reference_index) contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))} - output_file_writer.output_bam_file(sam_reads_files, contigs_by_index) + output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len) diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index c158f87c..05a4f2c6 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -358,8 +358,10 @@ def generate_reads(reference: SeqRecord, else: properly_paired = True - read_name = f'{base_name}_{str(i)}' + read_name = f'{base_name}_{str(i+1)}' + # Initialize variable to track padding + padding = 0 # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: # It's properly paired if it's not a singleton @@ -368,8 +370,9 @@ def generate_reads(reference: SeqRecord, # 30 is basically arbitrary. This may be a function of read length or determinable? # Though we don't figure out the variants until later - # this start check ensures that we get a valid segment - segment = reference[read1[0]: read1[1] + 30].seq + # upper ensures that we get a segment with NEAT-recognized bases + segment = reference[read1[0]: read1[1] + 30].seq.upper() + padding = len(segment) - options.read_len read_1 = Read( name=read_name + "/1", @@ -378,6 +381,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read1[0] + ref_start, end_point=read1[1] + ref_start, + padding=padding, quality_offset=options.quality_offset, is_paired=is_paired ) @@ -396,9 +400,14 @@ def generate_reads(reference: SeqRecord, # 30 is arbitrary. This may be a function of read length or determinable? 
# Though we don't figure out the variants until later - # this start check ensures that we get a valid segment - start_cooridnate = max((read2[0] - 30), 0) - segment = reference[start_cooridnate: read2[1]].seq + # If we had a certain amount of padding from read_1, we'll use that here + if padding: + start_coordinate = max((read2[0] - padding), 0) + else: + start_coordinate = max((read2[0] - 30), 0) + # upper ensures that we get a segment with NEAT-recognized bases + segment = reference[start_coordinate: read2[1]].seq.upper() + padding = len(segment) - options.read_len read_2 = Read( name=read_name + "/2", @@ -407,6 +416,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read2[0] + ref_start, end_point=read2[1] + ref_start, + padding=padding, quality_offset=options.quality_offset, is_reverse=True, is_paired=is_paired diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index 5958486f..93a87912 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -283,7 +283,7 @@ def merge_temp_fastqs( fastq2_path = Path(self.fastq2_fn) fastq2_path.unlink() - def output_bam_file(self, reads_files: list, contig_dict: dict): + def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int): """ This section is for producing a CIGAR string using a temp sam file(sam file with original sequence instead of a cigar string) @@ -291,6 +291,7 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): :param reads_files: The list of temp sams to combine :param contig_dict: A dictionary with the keys as contigs from the reference, and the values the index of that contig + :param read_length: the length of the reads for this run """ # TODO incorporate new read list (no longer a dictionary) from generate_reads pickle file bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL) @@ -317,19 +318,20 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): read1 = read_data[0] read2 = read_data[1] if read1: - self.write_bam_record(read1, contig_dict[read1.reference_id], bam_out) + self.write_bam_record(read1, contig_dict[read1.reference_id], bam_out, read_length) if read2: - self.write_bam_record(read2, contig_dict[read2.reference_id], bam_out) + self.write_bam_record(read2, contig_dict[read2.reference_id], bam_out, read_length) bam_out.close() - def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWriter): + def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWriter, read_length: int): """ Takes a read object and writes it out as a bam record :param read: a read object containing everything we need to write it out. :param contig_id: the index of the reference for this :param bam_handle: the handle of the file object to write to. 
+ :param read_length: the length of the read to output """ read_bin = reg2bin(read.position, read.end_point) @@ -356,8 +358,8 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit for i in range(cig_ops): encoded_cig.extend(pack(' Date: Thu, 23 May 2024 22:02:21 -0500 Subject: [PATCH 6/8] Working on ironing out same variables --- neat/models/models.py | 3 - neat/read_simulator/utils/generate_reads.py | 27 +---- neat/read_simulator/utils/read.py | 103 ++++++++++++-------- 3 files changed, 66 insertions(+), 67 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index 258a74f8..8d61b4db 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -418,9 +418,6 @@ def get_sequencing_errors(self, del_blacklist = [] for index in error_indexes[::-1]: - # Skip any N's - if reference_segment[index] == 'N': - continue # determine error type. Most will be SNVs error_type = SingleNucleotideVariant diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 05a4f2c6..6f5165cd 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -154,26 +154,6 @@ def find_applicable_mutations(my_read: Read, all_variants: ContigVariants) -> di return return_dict -def replace_n(segment: Seq, rng: Generator) -> Seq: - """ - Replaces invalid characters with random valid characters - - :param segment: A Seq object containing a DNA sequence - :param rng: The random number generator for the run - :return: The modified sequence object - """ - modified_segment = "" - # This takes care of soft masking - segment_no_mask = segment.upper() - for base in segment_no_mask: - if base in ALLOWED_NUCL: - modified_segment += base - else: - modified_segment += rng.choice(ALLOWED_NUCL) - - return Seq(modified_segment) - - def merge_sort(my_array: np.ndarray): """ This sorts the reads in reverse position, merging as it goes, in order to get the proper order @@ -369,9 +349,8 @@ def generate_reads(reference: SeqRecord, # add a small amount of padding to the end to account for deletions # 30 is basically arbitrary. This may be a function of read length or determinable? 
# Though we don't figure out the variants until later + segment = reference[read1[0]: read1[1] + 30].seq - # upper ensures that we get a segment with NEAT-recognized bases - segment = reference[read1[0]: read1[1] + 30].seq.upper() padding = len(segment) - options.read_len read_1 = Read( @@ -405,8 +384,8 @@ def generate_reads(reference: SeqRecord, start_coordinate = max((read2[0] - padding), 0) else: start_coordinate = max((read2[0] - 30), 0) - # upper ensures that we get a segment with NEAT-recognized bases - segment = reference[start_coordinate: read2[1]].seq.upper() + # this ensures that we get a segment with NEAT-recognized bases + segment = reference[start_coordinate: read2[1]].seq padding = len(segment) - options.read_len read_2 = Read( diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py index f55c188c..8533ab63 100644 --- a/neat/read_simulator/utils/read.py +++ b/neat/read_simulator/utils/read.py @@ -162,18 +162,18 @@ def update_quality_array( np.array(new_quality_score), self.quality_array[location+len(reference):])) - def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorModel): + def apply_errors(self, err_model: SequencingErrorModel): """ This function applies errors to a sequence and calls the update_quality_array function after - :param mutated_sequence: The sequence to add errors to. :param err_model: The error model for this run, :return: None, The sequence, with errors applied """ + mutated_sequence = "" for error in self.errors: # Replace the entire ref sequence with the entire alt sequence mutated_sequence = \ - mutated_sequence[:error.location] + error.alt + mutated_sequence[error.location+len(error.ref):] + self.read_sequence[:error.location] + error.alt + self.read_sequence[error.location+len(error.ref):] # update quality score for error self.update_quality_array( error.ref, @@ -183,19 +183,18 @@ def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorM list(err_model.quality_scores), ) - return mutated_sequence + self.read_sequence = Seq(mutated_sequence) - def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mut_model: MutationModel): + def apply_mutations(self, quality_scores: list, mut_model: MutationModel): """ Applying mutations involves one extra step, because of polyploidism. There may be more than one mutation at a given location, so it is formulated as a list. We then pick one at random for this read. 
- :param mutated_sequence: The sequence we are applying the variant to :param quality_scores: The possible quality scores for this run (to update quality scores) :param mut_model: The mutation model for the run, used for the zygosity and for the rng :return: mutated sequence, with mutations applied """ - + mutated_sequence = "" for location in self.mutations: variant_to_apply = mut_model.rng.choice(self.mutations[location]) @@ -218,10 +217,10 @@ def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mu continue else: self.padding -= variant_to_apply.length - alternate = mutated_sequence[position] + alternate = self.read_sequence[position] else: reference_length = variant_to_apply.get_ref_len() - alternate = mutated_sequence.get_alt() + alternate = self.read_sequence.get_alt() # Replace the entire ref with the entire alt mutated_sequence = \ @@ -244,7 +243,7 @@ def apply_variants_for_final_output( self, error_model: SequencingErrorModel, mutation_model: MutationModel - ) -> Seq: + ): """ Gets mutated sequence to output for fastq/bam @@ -252,27 +251,19 @@ def apply_variants_for_final_output( :param mutation_model: The mutation model for the run :return: the mutated sequence """ - - rng = mutation_model.rng - # The read sequence should have some padding still at this point - read_sequence = MutableSeq(self.read_sequence) - # Deal with N's - for i in range(len(read_sequence)): - if read_sequence[i] == 'N': - # for now replace 'N' masked regions with pure noise. We may refine this in time - read_sequence[i] = rng.choice(['A', 'C', 'G', 'T']) - self.quality_array[i] = min(error_model.quality_scores) - + # I realize that it's a little weird to convert self.read_sequence to mutable then assign to a variable + # instead of operating on it in place, but while it is mutable, it was easier to treat it like a variable then + # reconvert and reassign. if self.mutations: - read_sequence = self.apply_mutations( - read_sequence, list(error_model.quality_scores), mutation_model - ) + self.apply_mutations(list(error_model.quality_scores), mutation_model) if self.errors: - read_sequence = self.apply_errors(read_sequence, error_model) + self.apply_errors(error_model) + # The segments we hold off truncating because we need them for alignments, + # but the quality array is fine to trim here. self.quality_array = self.quality_array[:self.length] - - return Seq(read_sequence) + # Convert back to immutable + self.read_sequence = Seq(self.read_sequence) def contains(self, test_pos: int): return self.position <= test_pos < self.end_point @@ -325,24 +316,21 @@ def finalize_read_and_write( """ # Generate quality scores to cover the extended segment. 
We'll trim later - raw_quality_array = err_model.get_quality_scores(len(self.reference_segment)) + self.quality_array = err_model.get_quality_scores(len(self.reference_segment)) - if self.is_reverse: - read_quality_array = raw_quality_array[::-1] - read_sequence = self.reference_segment.reverse_complement() - else: - read_quality_array = raw_quality_array - read_sequence = self.reference_segment + # This replaces either hard or soft-masked reference segment with upper case or a standard repeat + # It updates the quality array and reference segment in place, including reversing them, if appropriate + self.convert_masking(err_model) # Get errors for the rea and update the quality score - self.errors, self.padding = err_model.get_sequencing_errors(self.length, self.padding, read_sequence, - read_quality_array) + self.errors, self.padding = err_model.get_sequencing_errors(self.length, self.padding, self.reference_segment, + self.quality_array) - # Update the read as needed: - self.quality_array = read_quality_array - self.read_sequence = read_sequence + # Update the read sequence to match the reference + self.read_sequence = self.reference_segment - self.read_sequence = self.apply_variants_for_final_output(err_model, mut_model) + # This applies any variants, updates quality score and read sequence in place + self.apply_variants_for_final_output(err_model, mut_model) self.read_quality_string = "".join([chr(x + self.quality_offset) for x in self.quality_array]) self.mapping_quality = abs(err_model.rng.poisson(70)) @@ -353,6 +341,41 @@ def finalize_read_and_write( fastq_handle.write('+\n') fastq_handle.write(f'{self.read_quality_string}\n') + def convert_masking(self, error_model: SequencingErrorModel): + """ + Replaces invalid characters with random valid characters drawn from a standard repeat sequence seen in a lot + of different species (TTAGGG). If there is call for it, we can make this customizable to species. + + :param error_model: The error model for this run + :return: The modified sequence object + """ + bad_score = min(error_model.quality_scores) + # we'll use generic human repeats, as commonly found in masked regions. We may refine this to make configurable + repeat_bases = list("TTAGGG") + modified_segment = "" + modified_quality_array = self.quality_array.copy() + if self.is_reverse: + raw_segment = self.reference_segment.reverse_complement() + modified_quality_array = modified_quality_array[::-1] + else: + raw_segment = self.reference_segment + + for i in range(len(raw_segment)): + base = raw_segment[i] + if base.islower(): + modified_base = base.upper() + modified_quality_array[i] = bad_score + else: + modified_base = base + if modified_base in ALLOWED_NUCL: + modified_segment += modified_base + else: + modified_segment += repeat_bases[i % 6] + modified_quality_array[i] = bad_score + + self.reference_segment = Seq(modified_segment) + self.quality_array = modified_quality_array + def make_cigar(self): """ Aligns the reference and mutated sequences. From 0e11d20757032c7ed18e20e92100b81738282887 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Fri, 24 May 2024 21:12:54 -0500 Subject: [PATCH 7/8] Okay, I found a few more bugs related to N regionsn. Not bugs, exactly, but edge cases dealing with masked bases in a fasta file. I solved those issues and need to test, but it is looking much more promising than when I started the day. 
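The gist of the masked-base handling, as a rough standalone sketch (the actual Read.convert_masking in the diff below differs in detail, e.g. it tracks the N count and handles reversed reads): anything outside A/C/G/T is swapped for the next base of a generic TTAGGG repeat, and its quality score is dropped to the model's minimum.

    ALLOWED_NUCL = ("A", "C", "G", "T")
    REPEAT_BASES = "TTAGGG"

    def convert_masking_sketch(segment: str, quality: list, bad_score: int):
        out = []
        qual = list(quality)
        for i, base in enumerate(segment.upper()):   # upper() clears soft-masking
            if base in ALLOWED_NUCL:
                out.append(base)
            else:
                out.append(REPEAT_BASES[i % 6])      # fill hard-masked bases with a generic repeat
                qual[i] = bad_score                  # and mark them as low quality
        return "".join(out), qual

    convert_masking_sketch("acGTNN", [30] * 6, 2)    # -> ('ACGTGG', [30, 30, 30, 30, 2, 2])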
--- neat/models/models.py | 12 +- neat/read_simulator/utils/generate_reads.py | 42 +++--- .../utils/output_file_writer.py | 20 ++- neat/read_simulator/utils/read.py | 130 ++++++++++-------- 4 files changed, 112 insertions(+), 92 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index 8d61b4db..c66969fa 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -413,7 +413,7 @@ def get_sequencing_errors(self, if self.rng.random() < self.quality_score_error_rate[quality_scores[i]]: error_indexes.append(i) - num_indels_so_far = 0 + total_indel_length = 0 # To prevent deletion collisions del_blacklist = [] @@ -424,22 +424,22 @@ def get_sequencing_errors(self, # Not too sure about how realistic it is to model errors as indels, but I'm leaving the code in for now. # This is to prevent deletion error collisions and to keep there from being too many indel errors. - if 0 < index < self.read_length - max(self.deletion_len_model) and num_indels_so_far > self.read_length//2: + if 0 < index < self.read_length - max(self.deletion_len_model) and total_indel_length > self.read_length//4: error_type = self.rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values())) # Deletion error if error_type == Deletion: deletion_length = self.get_deletion_length() if padding - deletion_length < 0: - # No more space in this read to add deletions - padding = 0 + # No space in this read to add this deletion continue deletion_reference = reference_segment.seq[index: index + deletion_length + 1] deletion_alternate = deletion_reference[0] introduced_errors.append( ErrorContainer(Deletion, index, deletion_length, deletion_reference, deletion_alternate) ) - num_indels_so_far += deletion_length + total_indel_length += deletion_length + del_blacklist.extend(list(range(index, index + deletion_length))) padding -= deletion_length @@ -451,7 +451,7 @@ def get_sequencing_errors(self, introduced_errors.append( ErrorContainer(Insertion, index, insertion_length, insertion_reference, insertion_alternate) ) - num_indels_so_far += insertion_length + total_indel_length += insertion_length # Insert substitution error # Programmer note: if you add new error types, they can be added as elifs above, leaving the final diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 6f5165cd..95f45b77 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -340,18 +340,17 @@ def generate_reads(reference: SeqRecord, read_name = f'{base_name}_{str(i+1)}' - # Initialize variable to track padding - padding = 0 # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: # It's properly paired if it's not a singleton is_paired = not read1_is_singleton - # add a small amount of padding to the end to account for deletions - # 30 is basically arbitrary. This may be a function of read length or determinable? - # Though we don't figure out the variants until later - segment = reference[read1[0]: read1[1] + 30].seq + # add a small amount of padding to the end to account for deletions. + # Trying out this method of using the read-length, which for the default neat run gives ~30. 
+ padding = options.read_len//5 + segment = reference[read1[0]: read1[1] + padding].seq - padding = len(segment) - options.read_len + # if we're at the end of the contig, this may not pick up the full padding + actual_padding = len(segment) - options.read_len read_1 = Read( name=read_name + "/1", @@ -360,8 +359,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read1[0] + ref_start, end_point=read1[1] + ref_start, - padding=padding, - quality_offset=options.quality_offset, + padding=actual_padding, is_paired=is_paired ) @@ -370,23 +368,20 @@ def generate_reads(reference: SeqRecord, handle = fq1_paired else: handle = fq1_single - read_1.finalize_read_and_write(error_model_1, mutation_model, handle, options.produce_fastq) + read_1.finalize_read_and_write( + error_model_1, mutation_model, handle, options.quality_offset, options.produce_fastq + ) # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip if not read1_is_singleton: is_paired = not read2_is_singleton - # Because this segment reads back to front, we need padding at the beginning. - # 30 is arbitrary. This may be a function of read length or determinable? - # Though we don't figure out the variants until later - - # If we had a certain amount of padding from read_1, we'll use that here - if padding: - start_coordinate = max((read2[0] - padding), 0) - else: - start_coordinate = max((read2[0] - 30), 0) + # Padding, as above + padding = options.read_len//5 + start_coordinate = max((read2[0] - padding), 0) # this ensures that we get a segment with NEAT-recognized bases segment = reference[start_coordinate: read2[1]].seq - padding = len(segment) - options.read_len + # See note above + actual_padding = len(segment) - options.read_len read_2 = Read( name=read_name + "/2", @@ -395,8 +390,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read2[0] + ref_start, end_point=read2[1] + ref_start, - padding=padding, - quality_offset=options.quality_offset, + padding=actual_padding, is_reverse=True, is_paired=is_paired ) @@ -406,7 +400,9 @@ def generate_reads(reference: SeqRecord, handle = fq2_paired else: handle = fq2_single - read_2.finalize_read_and_write(error_model_2, mutation_model, handle, options.produce_fastq) + read_2.finalize_read_and_write( + error_model_2, mutation_model, handle, options.quality_offset, options.produce_fastq + ) if properly_paired: properly_paired_reads.append((read_1, read_2)) diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index 93a87912..365a77ba 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -293,7 +293,6 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int and the values the index of that contig :param read_length: the length of the reads for this run """ - # TODO incorporate new read list (no longer a dictionary) from generate_reads pickle file bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL) bam_out.write("BAM\1") header = "@HD\tVN:1.4\tSO:coordinate\n" @@ -312,16 +311,26 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int bam_out.write(f'{item}\0') bam_out.write(pack(' 1: @@ -150,7 +146,7 @@ def update_quality_array( # Original ref is unaffected, so it's quality score remains the same new_quality_score = [low_score] * (len(alternate) - 1) # If is deletion - elif len(reference) > 1 
and len(alternate) == 1: + elif ref_length > 1 and len(alternate) == 1: new_quality_score = [] # SNP else: @@ -160,7 +156,7 @@ def update_quality_array( self.quality_array = \ np.concatenate((self.quality_array[:location], np.array(new_quality_score), - self.quality_array[location+len(reference):])) + self.quality_array[location+ref_length:])) def apply_errors(self, err_model: SequencingErrorModel): """ @@ -169,14 +165,14 @@ def apply_errors(self, err_model: SequencingErrorModel): :param err_model: The error model for this run, :return: None, The sequence, with errors applied """ - mutated_sequence = "" + mutated_sequence = MutableSeq(self.read_sequence) for error in self.errors: # Replace the entire ref sequence with the entire alt sequence mutated_sequence = \ - self.read_sequence[:error.location] + error.alt + self.read_sequence[error.location+len(error.ref):] + mutated_sequence[:error.location] + error.alt + mutated_sequence[error.location+len(error.ref):] # update quality score for error self.update_quality_array( - error.ref, + len(error.ref), error.alt, error.location, "error", @@ -194,7 +190,7 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): :param mut_model: The mutation model for the run, used for the zygosity and for the rng :return: mutated sequence, with mutations applied """ - mutated_sequence = "" + mutated_sequence = MutableSeq(self.read_sequence) for location in self.mutations: variant_to_apply = mut_model.rng.choice(self.mutations[location]) @@ -220,7 +216,7 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): alternate = self.read_sequence[position] else: reference_length = variant_to_apply.get_ref_len() - alternate = self.read_sequence.get_alt() + alternate = variant_to_apply.get_alt() # Replace the entire ref with the entire alt mutated_sequence = \ @@ -229,7 +225,7 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): mutated_sequence[position + reference_length:] self.update_quality_array( - self.reference_segment[location: location+reference_length], + reference_length, alternate, location, "mutation", @@ -237,7 +233,8 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): qual_score ) - return mutated_sequence + # Update the read sequence with the applied mutations + self.read_sequence = Seq(mutated_sequence) def apply_variants_for_final_output( self, @@ -262,8 +259,6 @@ def apply_variants_for_final_output( # The segments we hold off truncating because we need them for alignments, # but the quality array is fine to trim here. self.quality_array = self.quality_array[:self.length] - # Convert back to immutable - self.read_sequence = Seq(self.read_sequence) def contains(self, test_pos: int): return self.position <= test_pos < self.end_point @@ -303,6 +298,7 @@ def finalize_read_and_write( err_model: SequencingErrorModel, mut_model: MutationModel, fastq_handle: TextIO, + quality_offset: int, produce_fastq: bool, ): """ @@ -311,6 +307,7 @@ def finalize_read_and_write( :param err_model: The error model for the run :param mut_model: The mutation model for the run :param fastq_handle: the path to the fastq model to write the read + :param quality_offset: the quality offset for this run :param produce_fastq: If true, this will write out the temp fastqs. If false, this will only write out the tsams to create the bam files. 
""" @@ -322,18 +319,25 @@ def finalize_read_and_write( # It updates the quality array and reference segment in place, including reversing them, if appropriate self.convert_masking(err_model) - # Get errors for the rea and update the quality score - self.errors, self.padding = err_model.get_sequencing_errors(self.length, self.padding, self.reference_segment, - self.quality_array) - - # Update the read sequence to match the reference + # set the read sequence to match the reference self.read_sequence = self.reference_segment + # Get errors for the rea and update the quality score + self.errors, self.padding = err_model.get_sequencing_errors( + self.length, + self.padding, + self.reference_segment, + self.quality_array + ) + # This applies any variants, updates quality score and read sequence in place self.apply_variants_for_final_output(err_model, mut_model) - self.read_quality_string = "".join([chr(x + self.quality_offset) for x in self.quality_array]) - self.mapping_quality = abs(err_model.rng.poisson(70)) + self.read_quality_string = "".join([chr(x + quality_offset) for x in self.quality_array]) + # If this read isn't low quality, pick a standard mapping quality. + # We could have this be user assigned. + if not self.mapping_quality: + self.mapping_quality = 70 if produce_fastq: fastq_handle.write(f'@{self.name}\n') @@ -352,29 +356,26 @@ def convert_masking(self, error_model: SequencingErrorModel): bad_score = min(error_model.quality_scores) # we'll use generic human repeats, as commonly found in masked regions. We may refine this to make configurable repeat_bases = list("TTAGGG") - modified_segment = "" - modified_quality_array = self.quality_array.copy() if self.is_reverse: - raw_segment = self.reference_segment.reverse_complement() - modified_quality_array = modified_quality_array[::-1] + raw_sequence = self.reference_segment.reverse_complement().upper() + self.quality_array = self.quality_array[::-1] else: - raw_segment = self.reference_segment - - for i in range(len(raw_segment)): - base = raw_segment[i] - if base.islower(): - modified_base = base.upper() - modified_quality_array[i] = bad_score - else: - modified_base = base - if modified_base in ALLOWED_NUCL: - modified_segment += modified_base - else: - modified_segment += repeat_bases[i % 6] - modified_quality_array[i] = bad_score + raw_sequence = self.reference_segment.upper() + + start = raw_sequence.find('N') + if start != -1: + modified_segment = MutableSeq(raw_sequence[:start]) + for i in range(start, len(raw_sequence)): + base = raw_sequence[i] + if base in ALLOWED_NUCL: + modified_segment += base + else: + modified_segment += repeat_bases[i % 6] + self.num_ns += 1 + else: + modified_segment = MutableSeq(raw_sequence) self.reference_segment = Seq(modified_segment) - self.quality_array = modified_quality_array def make_cigar(self): """ @@ -383,18 +384,23 @@ def make_cigar(self): # These parameters were set to minimize breaks in the mutated sequence and find the best # alignment from there. 
-
-        # We're ignoring soft-masking (lower case letters)
-        if self.is_reverse:
-            read_sequence = self.read_sequence.reverse_complement().upper()
-        else:
-            read_sequence = self.read_sequence.upper()
-        reference_segment = self.reference_segment.upper()
         raw_alignment = pairwise2.align.localms(
-            reference_segment, read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1,
+            self.reference_segment, self.read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1,
             penalize_extend_when_opening=True
         )
+        is_n_heavy = self.num_ns > 10
+
+        # Recall that read_length//5 was what we used for padding.
+        crap_alignment = False
+        if raw_alignment[0][1].count('-') > self.length//10:
+            # Crap alignment
+            crap_alignment = True
+
+        n_to_crap_correlation = False
+        if is_n_heavy and crap_alignment:
+            n_to_crap_correlation = True
+
         alignment = format_alignment(*raw_alignment[0], full_sequences=True).split()
         aligned_template_seq = alignment[0]
         aligned_mut_seq = alignment[-2]
@@ -403,7 +409,7 @@ def make_cigar(self):
         curr_char = ''
         cig_string = ''
         # Find first match
-        for char in range(len(read_sequence)):
+        for char in range(len(self.read_sequence)):
             if aligned_template_seq[char] == '-':  # insertion
                 if curr_char == 'I':  # more insertions
                     cig_count += 1
@@ -417,7 +423,8 @@ def make_cigar(self):
                 if curr_char == 'D':  # more deletions
                     cig_count += 1
                 else:  # new deletion
-                    cig_string = cig_string + str(cig_count) + curr_char
+                    if cig_count != 0:
+                        cig_string = cig_string + str(cig_count) + curr_char
                     curr_char = 'D'
                     cig_count = 1
             else:  # match
@@ -427,7 +434,7 @@ def make_cigar(self):
                 else:  # new match
                     # If there is anything before this, add it to the string and increment,
                     # else, just increment
-                    if not cig_count == 0:
+                    if cig_count != 0:
                         cig_string = cig_string + str(cig_count) + curr_char
                     curr_char = 'M'
                     cig_count = 1
@@ -436,9 +443,11 @@ def make_cigar(self):
                 break

         if cig_length < self.length:
-            raise ValueError("Problem creating cigar string")
+            # testing a theory
+            pass
+            # raise ValueError("Problem creating cigar string")

         # append the final section as we return
-        return cig_string + str(cig_count) + curr_char
+        return cig_string + str(cig_count) + curr_char, n_to_crap_correlation, crap_alignment

     def get_mpos(self):
         """
@@ -467,3 +476,8 @@ def get_tlen(self):
             return length
         else:
             return 0
+
+    def align(self):
+        """
+        Because of the way we have set up the segments, it should be easy to align these.
+ """ From 5250f56e41046bf003e480db58ff24021264dba6 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 25 May 2024 03:12:14 -0500 Subject: [PATCH 8/8] It's passing my tests and I think it's ready to merge te develop --- neat/common/logging.py | 2 ++ neat/read_simulator/runner.py | 6 ++-- neat/read_simulator/utils/generate_reads.py | 6 ++-- .../utils/output_file_writer.py | 32 +++++++++++-------- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/neat/common/logging.py b/neat/common/logging.py index eab3b512..92de3d37 100644 --- a/neat/common/logging.py +++ b/neat/common/logging.py @@ -57,4 +57,6 @@ def setup_logging( logging.basicConfig(**kwargs) + logging.getLogger(__name__).info(f"writing log to: {log_file}") + diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 7b408944..7645f211 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -9,8 +9,8 @@ from Bio import SeqIO from pathlib import Path -from .utils import Options, parse_input_vcf, parse_beds, fill_out_bed_dict, OutputFileWriter, find_file_breaks, \ - map_chromosome, generate_variants, write_local_file, generate_reads +from .utils import Options, parse_input_vcf, parse_beds, OutputFileWriter, find_file_breaks, \ + generate_variants, write_local_file, generate_reads from ..common import validate_input_path, validate_output_path from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel, GcModel from ..models.default_cancer_mutation_model import * @@ -45,7 +45,7 @@ def initialize_all_models(options: Options): cancer_model = None if options.cancer and options.cancer_model: - cancer_model = pickle.load(gzip.open(options.cancer_model)) + # cancer_model = pickle.load(gzip.open(options.cancer_model)) # Set the rng for the cancer mutation model cancer_model.rng = options.rng elif options.cancer: diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 95f45b77..09236eb7 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -6,13 +6,11 @@ from math import ceil from pathlib import Path from Bio import SeqRecord -from Bio.Seq import Seq -from numpy.random import Generator from bisect import bisect_left, bisect_right from ...models import SequencingErrorModel, GcModel, FragmentLengthModel, MutationModel from .options import Options -from ...common import open_input, open_output, ALLOWED_NUCL +from ...common import open_output from ...variants import ContigVariants from .read import Read @@ -412,7 +410,7 @@ def generate_reads(reference: SeqRecord, else: singletons.append((None, read_2)) - _LOG.debug(f"Contig fastq(s) written in: {(time.process_time() - t)/60:.2f} m") + _LOG.info(f"Contig fastq(s) written in: {(time.process_time() - t)/60:.2f} m") if options.produce_bam: # this will give us the proper read order of the elements, for the sam. 
diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py
index 365a77ba..b7b9aade 100755
--- a/neat/read_simulator/utils/output_file_writer.py
+++ b/neat/read_simulator/utils/output_file_writer.py
@@ -8,24 +8,19 @@
 
 import gzip
 import re
-import shutil
+import time
 from struct import pack
 
 import logging
-import pysam
-import numpy as np
 import pickle
 
 from Bio import bgzf
 from Bio import SeqIO
-from Bio.Seq import Seq, MutableSeq
-from typing import Iterator, TextIO
 from pathlib import Path
 from numpy.random import Generator
 
 from ...common import validate_output_path, open_output, open_input, ALLOWED_NUCL
 from .read import Read
 from .options import Options
-from .neat_cigar import CigarString
 
 _LOG = logging.getLogger(__name__)
 
@@ -203,6 +198,7 @@ def merge_temp_fastqs(
 
         paired_files = []
         singleton_files = []
+        t = time.process_time()
 
         # First split the files into 2 categories. For paired ended reads, most reads will be paired, though some may
         # have gotten filtered out with input files. We will append any singletons in paired ended mode at the end,
@@ -230,10 +226,10 @@ def merge_temp_fastqs(
             for file_pair in singleton_files:
                 if file_pair[0]:
                     file_index = SeqIO.index(str(file_pair[0]), 'fastq')
-                    contig_name = Path(file_pair[0]).name.strip('_r1_single.fq.bgz')
+                    contig_name = Path(file_pair[0]).name.removesuffix('_r1_single.fq.bgz')
                 elif file_pair[1]:
                     file_index = SeqIO.index(str(file_pair[1]), 'fastq')
-                    contig_name = Path(file_pair[1]).name.strip('_r2_single.fq.bgz')
+                    contig_name = Path(file_pair[1]).name.removesuffix('_r2_single.fq.bgz')
                 else:
                     # So singletons for this contig, so move on
                     continue
@@ -259,10 +255,14 @@ def merge_temp_fastqs(
                 open_output(self.fastq2_fn) as fq2
         ):
             # First we add all properly paired reads
-            for i in range(len(shuffled_paired_keys)):
+            num_reads = len(shuffled_paired_keys)
+            for i in range(num_reads):
+                print(f'{i/num_reads:.2%}', end='\r')
                 current_key = shuffled_paired_keys[i]
                 # reconstruct tho chromosome name
-                chrom_name = current_key[0].removeprefix("NEAT-generated_").split('_')[0]
+                chrom_name_with_rdnm = current_key[0].removeprefix("NEAT-generated_").split('/')[0]
+                suffix = re.findall(r"_\d*$", chrom_name_with_rdnm)[0]
+                chrom_name = chrom_name_with_rdnm.removesuffix(suffix)
                 # 1 here because this is read1
                 read1 = fastq_index_dict[chrom_name][1][current_key[0]]
                 SeqIO.write(read1, fq1, 'fastq')
@@ -275,13 +275,16 @@ def merge_temp_fastqs(
             # Next we add the strays (or all reads, for single-ended)
             for j in range(len(shuffled_singleton_keys)):
                 current_key = shuffled_singleton_keys[j]
-                chrom_name = current_key[0].removeprefix("NEAT_generated_").split('_')[0]
-                read = fastq_index_dict[chrom_name][1][current_key[0]]
+                chrom_name_with_rdnm = current_key.removeprefix("NEAT-generated_").split('/')[0]
+                suffix = re.findall(r"_\d*$", chrom_name_with_rdnm)[0]
+                chrom_name = chrom_name_with_rdnm.removesuffix(suffix)
+                read = fastq_index_dict[chrom_name][1][current_key]
                 SeqIO.write(read, fq1, 'fastq')
 
         if not wrote_r2:
             fastq2_path = Path(self.fastq2_fn)
             fastq2_path.unlink()
+        _LOG.info(f"Fastq(s) written in {(time.process_time() - t)/60:.2f} m")
 
     def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int):
         """
@@ -293,6 +296,7 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int
             and the values the index of that contig
         :param read_length: the length of the reads for this run
         """
+        t = time.process_time()
         bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL)
         bam_out.write("BAM\1")
         header = "@HD\tVN:1.4\tSO:coordinate\n"
@@ -330,9 +334,11 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int
                 elif is_crap:
                     total_crap += 1
         if total_crap:
-            print(f"n-to-crap correlation: {n_to_crap_correlation/total_crap:.2%}")
+            _LOG.info(f"n-to-crap correlation: {n_to_crap_correlation/total_crap:.2%}")
 
         bam_out.close()
 
+        _LOG.info(f"Bam written in: {(time.process_time() - t)/60:.2f} m")
+
     def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWriter, read_length: int):
         """
         Takes a read object and writes it out as a bam record
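
For reference, the core of make_cigar in the read.py patch above is a run-length encoding of the pairwise2 alignment columns: a gap in the template column counts as an insertion (I), a gap in the read column as a deletion (D), and any other column as a match/mismatch (M). The following is a minimal standalone sketch of that encoding only; the function name cigar_from_alignment and its exact signature are illustrative, not part of the patch.

# Illustrative sketch (not part of the patch): run-length encode two aligned
# strings (template on top, read underneath) into a CIGAR string.
def cigar_from_alignment(aligned_template: str, aligned_read: str) -> str:
    assert len(aligned_template) == len(aligned_read)
    cigar = ""
    run_char = ""
    run_len = 0
    for t_base, r_base in zip(aligned_template, aligned_read):
        if t_base == "-":
            op = "I"  # read has a base the template lacks
        elif r_base == "-":
            op = "D"  # template has a base the read lacks
        else:
            op = "M"  # match or mismatch
        if op == run_char:
            run_len += 1
        else:
            if run_len:
                cigar += f"{run_len}{run_char}"
            run_char, run_len = op, 1
    if run_len:
        cigar += f"{run_len}{run_char}"
    return cigar

# e.g. cigar_from_alignment("ACGT-A", "AC-TTA") == "2M1D1M1I1M"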
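Similarly, the quality_offset argument threaded into finalize_read_and_write is the usual Phred-to-ASCII shift used when building the FASTQ quality string: each numeric score is offset and mapped to a character. A small illustrative helper, assuming the common Sanger/Illumina 1.8+ offset of 33 (phred_to_ascii is a made-up name, not NEAT's API):

# Illustrative sketch: convert numeric Phred scores to a FASTQ quality string.
def phred_to_ascii(quality_array, quality_offset=33):
    return "".join(chr(q + quality_offset) for q in quality_array)

# phred_to_ascii([30, 35, 40]) == "?DI"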