From d389ada333b6411e101583f77fc8d6f60e2cbd90 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 13 May 2024 22:18:52 -0500 Subject: [PATCH 1/8] Making progress simplifying and strengthening the generate reads section --- neat/models/default_fraglen_model.py | 8 - neat/models/models.py | 42 +- neat/read_simulator/runner.py | 17 +- neat/read_simulator/utils/generate_reads.py | 380 +++++------------- neat/read_simulator/utils/options.py | 13 +- ...over_dataset.py => test_generate_reads.py} | 63 +-- 6 files changed, 151 insertions(+), 372 deletions(-) delete mode 100644 neat/models/default_fraglen_model.py rename tests/test_read_simulator/{test_cover_dataset.py => test_generate_reads.py} (73%) diff --git a/neat/models/default_fraglen_model.py b/neat/models/default_fraglen_model.py deleted file mode 100644 index 5b3ef168..00000000 --- a/neat/models/default_fraglen_model.py +++ /dev/null @@ -1,8 +0,0 @@ -""" -The following model is the default for the fragment length distribution. -""" - -default_fragment_mean = 601.0 -default_fragment_std = 103.0 -default_fragment_max = 1626.0 -default_fragment_min = 3.0 \ No newline at end of file diff --git a/neat/models/models.py b/neat/models/models.py index f2a26b0b..fef6f270 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -21,7 +21,6 @@ from .default_mutation_model import * from .default_sequencing_error_model import * from .default_gc_bias_model import * -from .default_fraglen_model import * from .utils import bin_scores, take_closest __all__ = [ @@ -600,53 +599,28 @@ class FragmentLengthModel: :param fragment_mean: the mean of the collection of fragment lengths derived from data :param fragment_std: the standard deviation of the collection of fragment lengths derived from data - :param fragment_max: the largest fragment observed in the data - :param fragment_min: the smallest fragment observed in data :param rng: the random number generator for the run """ def __init__(self, - fragment_mean: float = None, - fragment_std: float = None, - fragment_max: int = None, - fragment_min: int = None, + fragment_mean: float, + fragment_std: float, rng: Generator = None): - self.fragment_mean = fragment_mean if fragment_mean else default_fragment_mean - self.fragment_st_dev = fragment_std if fragment_std else default_fragment_std - self.fragment_max = fragment_max if fragment_max else default_fragment_max - self.fragment_min = fragment_min if fragment_min else default_fragment_min + self.fragment_mean = fragment_mean + self.fragment_st_dev = fragment_std self.rng = rng def generate_fragments(self, total_length: int, - read_length: int, - coverage: int) -> list: + number_of_fragments: int) -> list: """ Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set :param total_length: Length of the reference segment we are covering. - :param read_length: average length of the reads - :param coverage: the target coverage number + :param number_of_fragments: The number of fragments needed. :return: A list of fragment random fragment lengths sampled from the model. """ - # Estimate the number of fragments needed (with a 2x padding) - number_of_fragments = int(round(total_length / read_length) * (coverage * 2)) - # Check that we don't have unusable values for fragment mean. Too many fragments under the read length means - # NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or - # it will only find a few and will run very slowly. 
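# Illustrative sketch only (values and the `rng` seed here are assumptions, not part of
# this patch): the simplified FragmentLengthModel that follows just samples a normal
# distribution, so guarding against fragments shorter than the read length is now the
# caller's job -- cover_dataset later drops reads drawn from fragments that are shorter
# than the read length instead of resetting the mean.
import numpy as np
rng = np.random.default_rng(0)
fragment_mean, fragment_st_dev, n_fragments = 250.0, 30.0, 5
fragments = np.round(rng.normal(fragment_mean, fragment_st_dev, size=n_fragments)).astype(int)
# fragments cluster around fragment_mean; anything under read_len is filtered downstream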
- if self.fragment_mean < read_length: - # Let's just reset the fragment mean to make up for this. - self.fragment_mean = read_length # generates a distribution, assuming normality, then rounds the result and converts to ints dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int) - # filter the list to throw out outliers and to set anything under the read length to the read length. - dist = [max(x, read_length) for x in dist if x <= self.fragment_max] - # Just a sanity check to make sure our data isn't too thin: - while number_of_fragments - len(dist) > 0: - additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev) - if additional_fragment < read_length: - continue - dist.append(round(additional_fragment)) - - # Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length - return dist + + return list(dist) diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 43d0836e..2e9f4965 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -98,15 +98,16 @@ def initialize_all_models(options: Options): _LOG.debug('GC Bias model loaded') - fraglen_model = None - if options.paired_ended: - if options.fragment_model: - fraglen_model = pickle.load(gzip.open(options.fragment_model)) - else: - fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev) - - # Set the rng for the fragment length model + if options.fragment_model: + fraglen_model = pickle.load(gzip.open(options.fragment_model)) fraglen_model.rng = options.rng + elif options.fragment_mean: + fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng) + else: + # For single ended, fragment length will be based on read length + fragment_mean = options.read_len * 1.5 + fragment_st_dev = fragment_mean * 0.2 + fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng) _LOG.debug("Fragment length model loaded") diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 96b3c8a4..7a4bc699 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -3,7 +3,7 @@ import pickle import numpy as np -from math import ceil, floor +from math import ceil from pathlib import Path from Bio import SeqRecord from Bio.Seq import Seq @@ -17,7 +17,8 @@ from .read import Read __all__ = [ - 'generate_reads' + 'generate_reads', + 'cover_dataset', ] _LOG = logging.getLogger(__name__) @@ -36,15 +37,18 @@ def is_too_many_n(segment): return n / len(segment) >= 0.2 +""" +Currently this code block is unused, but I think it might be useful for parallelization, so I wanted to leave it in. + def create_windows(sequence: SeqRecord, size: int, overlap: int): - """ - Create a list of windows. We might need this for parallelization, so I'm leaving the code in place + \""" + Create a list of windows. Currently Unused. 
We might need this for parallelization, so I'm leaving the code in place :param sequence: Sequence to split :param size: size of windows :param overlap: size of overlap between windows :return: list of windows - """ + \""" windows = [] for i in range(0, len(sequence), size): if i < overlap and i + size + overlap < len(sequence): @@ -55,13 +59,12 @@ def create_windows(sequence: SeqRecord, size: int, overlap: int): windows.append((i, len(sequence))) return windows +""" def cover_dataset( span_length: int, - target_vector: np.ndarray, options: Options, - gc_bias_model: GcModel, fragment_model: FragmentLengthModel | None, ) -> list: """ @@ -69,211 +72,79 @@ def cover_dataset( to the proper coverage depth. It uses an abstract representation of the reads, by end points. :param span_length: The total length the cover needs to span - :param target_vector: The vector of coverage targets by position :param options: The options for the run - :param gc_bias_model: The gc-bias model being used for this run. :param fragment_model: The fragment model used for to generate random fragment lengths """ - # Variables to track reads and current coverage - temp_reads = [] - final_coverage = [0] * len(target_vector) - final_reads = [] - - # Determine potential fragment lengths - if fragment_model: - # Paired ended case - fragment_pool = fragment_model.generate_fragments(span_length, options.read_len, options.coverage) - else: - # single ended case - fragment_pool = [options.read_len] - - # We need a window to cover. If window size of the gc model is greater than read length, then we'll call it 1, - # meaning 1 section of the target vector. - read_window_width = max(max(fragment_pool)//gc_bias_model.window_size, 1) - - # A quarter of a read length - quarter_read = options.read_len//4 - # for each position, the set of reads covering that position. - for i in range(len(target_vector)-read_window_width): - section_id = (i, i+read_window_width) - subsection_target_vector = target_vector[i: i+read_window_width] - subsection_target = ceil(np.mean(subsection_target_vector)) - - # Check if we even need to bother this section - section_max = max(final_coverage[i: i+read_window_width]) - if section_max >= subsection_target: - # we have enough reads here, move on. - i += options.rng.choice(list(range(quarter_read))) - continue - - # We need to know where we are along the reference contig. We are i * read_window_widths in, - # and each read_window is gc_bias_model.window_size long. - start_coordinate = i * read_window_width * gc_bias_model.window_size - # read_count counts the reads added to this section - read_count = 0 - # 1.5x as many as needed to give the final set a bit of randomness. - temp_pool_size = ceil(subsection_target * 1.5) - # debug variable - k = 0 - while read_count < temp_pool_size: - # The structure for these reads will be (left_start, left_end, right_start, right_end) - # where start and end are ints with end > start. Reads can overlap, so right_start < left_end - # is possible, but the reads cannot extend past each other, so right_start < left_start and - # left_end > right_end are not possible. - if options.paired_ended: - left_read_length = right_read_length = options.read_len - # Select a random fragment length from the pool. - fragment_length = options.rng.choice(fragment_pool) - else: - left_read_length = options.read_len - right_read_length = 0 - fragment_length = left_read_length - - # Booleans telling us how to deal with the various exceptions below. 
- skip_left = False - skip_right = False - - left_start = start_coordinate - # This trims the left read to the region of interest - left_end = min(span_length, start_coordinate + left_read_length) - - # Make right read, if paired ended, else set it equal to the default - if options.paired_ended: - # The candidate end points of the paired right read, uncorrected. It starts from the far end, so we - # know the end based on the fragment length and we count in from there. - uncorrected_right_end = start_coordinate + fragment_length - uncorrected_right_start = uncorrected_right_end - right_read_length - - # this trims the raw right read, so that right_start >= left_start and left_end <= right_end - # this keeps us from getting unrealistic paired ended reads - right_start = max(uncorrected_right_start, left_start) - right_end = min(max(uncorrected_right_end, left_end), span_length) - - # Both reads are off the map, so we throw this out: - if right_end <= 0: - # Move the start coordinate slightly to see if this keeps things moving. - start_coordinate = min( - max(start_coordinate + options.rng.integers(-10, 10), 0), - span_length - fragment_length - 1 - ) - k += 1 - if k >= 1000: - _LOG.debug(f"skipped {k} times") - continue - - else: - skip_right = True - right_start = 0 - right_end = 0 - - # Cases that might make us want to skip these reads: - - # Case 1: Truncated left read, unusable - if left_end - left_start < left_read_length: - # right read is usable if it's not off the map and is at least a read length long - if right_start < span_length and \ - (right_end - right_start >= right_read_length): - # skip the left read, keep the right read, indicating an unpaired right read - skip_left = True - # else, skip both - else: - skip_left = skip_right = True - - # Case 2: Truncated right read, unusable - if right_end - right_start < right_read_length: - # left read is usable if it's not off the map and has at least half a - # read length of usable bases - if left_start < span_length and \ - (left_end - left_start >= left_read_length): - # Set the right read to (0, 0), indicating an unpaired left read - skip_right = True - # else, skip both + final_reads = set() + # precompute how many reads we want + # The numerator is the total number of base pair calls needed. + # Divide that by read length gives the number of reads needed + number_reads = ceil((span_length * options.coverage) / options.read_len) + + # We use fragments to model the DNA + fragment_pool = fragment_model.generate_fragments(span_length, number_reads * 3) + + # step 1: Divide the span up into segments drawn froam the fragment pool. Assign reads based on that. 
+ # step 2: repeat above until number of reads exceeds number_reads * 1.5 + # step 3: shuffle pool, then draw number_reads (or number_reads/2 for paired ended) reads to be our reads + read_count = 0 + while read_count <= number_reads: + start = 0 + temp_fragments = [] + # Breaking the gename into fragments + while start < span_length: + # We take the first element and put it back on the end to create an endless pool of fragments to draw from + fragment = fragment_pool.pop(0) + end = min(start + fragment, span_length) + # these are equivalent of reads we expect the machine to filter out, but we won't actually use it + if end - start < options.read_len: + # add some random flavor to try to keep it to falling into a loop + if options.rng.normal() < 0.5: + fragment_pool.insert(fragment, len(fragment_pool)//2) else: - skip_left = skip_right = True - - if skip_left and skip_right: - # If neither read is usable, we'll just move to the next - # Move the start coordinate slightly to see if this keeps things moving. - start_coordinate = min( - max(start_coordinate + options.rng.integers(-10, 10), 0), - span_length - fragment_length - 1 - ) - k += 1 - if k >= 1000: - _LOG.debug(f"skipped {k} times") - continue - - if not skip_left: - temp_left_read = (left_start, left_end) + fragment_pool.insert(fragment, len(fragment_pool) - 3) else: - temp_left_read = (0, 0) - - if not skip_right: - temp_right_read = (right_start, right_end) + fragment_pool.append(fragment) + temp_fragments.append((start, end)) + start = end + + # Generating reads from fragments + for fragment in temp_fragments: + read_start = fragment[0] + read_end = read_start + options.read_len + # This filters out those small fragments, to give the dataset some realistic variety + if read_end > fragment[1]: + continue else: - temp_right_read = (0, 0) - - read_to_add = temp_left_read + temp_right_read - temp_reads.append(read_to_add) - read_count += 1 - # bump it a bit to give it a little variety. - start_coordinate = min( - max(start_coordinate + options.rng.integers(-10, 10), 0), - span_length - fragment_length - 1 - ) - - # Subset this set of temporary reads to make sure we don't overdo it. - reads_to_add, final_coverage = final_subsetting( - candidate_reads=temp_reads, - target=subsection_target, - coverage_vector=final_coverage, - coordinates=section_id, - rng=options.rng - ) - final_reads.extend(reads_to_add) - - return final_reads - - -def final_subsetting( - candidate_reads: list, - target: int, - coverage_vector: list, - coordinates: tuple, - rng: Generator -) -> (list, list): - """ - The final subsetting for reads. We basically just calculate if we need to subset, then how many final reads we - should have, and we draw them randomly without replacement until we have enough. - - :param candidate_reads: The list of reads in the following format: (left_start, left_end, right_start, right_end) - :param target: The target coverage value for this section - :param coverage_vector: The coverage vector, will be updated with new counts. - :param coordinates: The indices of interest. - :param rng: The random number generator for the run - :return: Two lists, left and right reads, culled to proper coverage depth. - """ - - # Throw out fully empty reads. - filtered_candidates = [] - for read in candidate_reads: - if not any(read): - continue - elif read.count('N') > len(read)/2: - continue - else: - filtered_candidates.append(read) - - # if we're under target, we want to make up the difference. - # grab however many we need. 
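# Back-of-envelope check of the read-count estimate used in the new cover_dataset above.
# The numbers are invented for illustration; the 1.85 divisor is the patch's own
# admittedly arbitrary stand-in for "roughly two reads per proper pair".
from math import ceil
span_length, coverage, read_len = 1_000_000, 30, 150
number_reads = ceil((span_length * coverage) / read_len)  # = 200_000 reads for 30x at 150 bp
paired_records = ceil(number_reads / 1.85)                # = 108_109 fragment records, ~2 reads each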
- ret_set = rng.choice(filtered_candidates, size=target, replace=False) - - # Update the coverage vector - for i in range(coordinates[0], coordinates[1]): - coverage_vector[i] += target - - return ret_set, coverage_vector + read1 = (read_start, read_end) + if options.paired_ended: + # This will be valid because of the check above + read2 = (fragment[1] - options.read_len, fragment[1]) + else: + read2 = (0, 0) + # The structure for these reads will be (left_start, left_end, right_start, right_end) + # where start and end are ints with end > start. Reads can overlap, so right_start < left_end + # is possible, but the reads cannot extend past each other, so right_start < left_start and + # left_end > right_end are not possible. + read = read1 + read2 + if read not in final_reads: + final_reads.add(read) + read_count += 1 + + # Convert set to final list + final_reads = list(final_reads) + # Now we shuffle them to add some randomness + options.rng.shuffle(final_reads) + # And only return the number we needed + if options.paired_ended: + # Since each read is actually 2 reads, we only need to return half as many. But to cheat a few extra, we scale + # that down slightly to 1.85 reads per read. This factor is arbitrary and may even be a function. But let's see + # how well this estimate works + return final_reads[:ceil(number_reads/1.85)] + else: + # Each read lacks a pair, so we need the full number of single ended reads + return final_reads[:number_reads] def find_applicable_mutations(my_read: Read, all_variants: ContigVariants) -> dict: @@ -313,48 +184,6 @@ def replace_n(segment: Seq, rng: Generator) -> Seq: return Seq(modified_segment) -def modify_target_coverage( - target_regions: list, - discard_regions: list, - coverage_vector: np.ndarray, - coverage: int, - target_shape: int): - """ - Modifies the coverage vector by applying the list of regions. For this version, areas - outside the regions have coverage adjusted by the off_target_percent - - :param target_regions: A list of intervals to target, extracted from a bed file - :param discard_regions: A list of regions to throw out, extracted from a bed file - :param coverage_vector: The target coverage vector, which will be modified - :param coverage: The target coverage value at this position - :param target_shape: The size of the bins in the final output - :return: The updated target coverage vector, binned into target_shape sized bins. - """ - - for region in target_regions: - # in the included regions, any are marked false is to be excluded. Everything else remains untouched - if not region[2]: - coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0]) - - for region in discard_regions: - # in the discard regions section, a True indicates this region falls - # within a discard bed and should be discarded. 
Note that this will discard regions if the discard and target - # beds overlap - if region[2]: - coverage_vector[region[0]: region[1]] = [0] * (region[1] - region[0]) - - # now for the final coverage vector, we need to bin by the window bias in the gc-bias model - start = 0 - final_coverage_vector = [] - for i in range(target_shape, len(coverage_vector), target_shape): - subsection = coverage_vector[start: i] - subsection_average = round(sum(subsection)/len(subsection)) # round() with no second argument returns an int - final_coverage_vector.append(subsection_average) - start = i - - return final_coverage_vector - - def merge_sort(my_array: np.ndarray): """ This sorts the reads in reverse position, merging as it goes, in order to get the proper order @@ -411,60 +240,30 @@ def generate_reads(reference: SeqRecord, start = time.time() base_name = f'{Path(options.output).name}-{chrom}' - # Assume uniform coverage - target_coverage_vector = np.full(shape=len(reference), fill_value=options.coverage) - if not options.no_coverage_bias: - pass - # I'm trying to move this section into cover_dataset. - # target_coverage_vector = gc_bias.create_coverage_bias_vector(target_coverage_vector, reference.seq.upper()) - - # We want to bin the coverage into window-sized segments to speed up calculations. - # This divides the segment into len(reference) // window_size (integer division). - target_shape = len(reference) // gc_bias.window_size - - # Apply the targeting/discarded rates. - target_coverage_vector = modify_target_coverage( - targeted_regions, discarded_regions, target_coverage_vector, options.coverage, target_shape - ) _LOG.debug("Covering dataset.") t = time.process_time() reads = cover_dataset( len(reference), - target_coverage_vector, options, - gc_bias, fraglen_model, ) - _LOG.debug(f"Data set coverage took: {time.process_time() - t}") - - # Reads that are paired - paired_reads = np.asarray([tuple(x) for x in reads if any(x[0:2]) and any(x[2:4])]) - if paired_reads.size: - paired_reads = merge_sort(paired_reads) + _LOG.debug(f"Dataset coverage took: {time.process_time() - t}") - # singletons - if paired_reads.any(): - singletons = np.asarray([tuple(x) for x in reads if x not in paired_reads and any(x)]) - else: - singletons = np.asarray([tuple(x) for x in reads if any(x)]) - if singletons.size: - singletons = merge_sort(singletons) - - # determine sam read order. It should be paired reads, then singletons, unless one or the other is missing. - if paired_reads.size and singletons.size: - sam_read_order = np.concatenate((paired_reads, singletons)) - elif paired_reads.size: - # if singletons is empty - sam_read_order = paired_reads - else: - # if there are no paired reads - sam_read_order = singletons + # Filters left to apply: N filter... not sure yet what to do with those (output random low quality reads, maybe) + # Target regions + # discard regions + # GC-bias figure out some way to throw out 75% of reads with high GC or AT content (see original gc bias model). + # heterozygous variants. Okay, so I've been applying the variant to all but for heterozygous we need to apply the + # variant to only some + paired_reads = [] + singletons = [] final_sam_dict = {} - if options.paired_ended: - _LOG.debug(f"Paired percentage = {len(paired_reads)/len(sam_read_order)}") + # This is the orderd the reads will be in when sorted in the final sam file, if we're doing that. They are + # easier to sort now as sets of coordinates than they will be later as lines in a file. 
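# Small illustration (coordinates invented) of why sorting the raw coordinate tuples is
# enough here: Python orders tuples lexicographically, so records sort by left_start first,
# which is roughly the coordinate order the eventual sam/bam wants. The explicit
# key=lambda element: element[0:] below is equivalent to the default tuple sort.
example_reads = [(300, 450, 480, 630), (0, 150, 200, 350), (120, 270, 0, 0)]
assert sorted(example_reads) == [(0, 150, 200, 350), (120, 270, 0, 0), (300, 450, 480, 630)]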
+ sam_dict_order = sorted(reads, key=lambda element: (element[0:])) _LOG.debug("Writing fastq(s) and optional tsam, if indicated") t = time.process_time() @@ -561,5 +360,8 @@ def generate_reads(reference: SeqRecord, with open_output(reads_pickle) as reads: pickle.dump(final_sam_dict, reads) + if options.paired_ended: + _LOG.debug(f"Paired percentage = {len(paired_reads)/len(sam_read_order)}") + _LOG.info(f"Finished sampling reads in {time.time() - start} seconds") return chrom_fastq_r1, chrom_fastq_r2 diff --git a/neat/read_simulator/utils/options.py b/neat/read_simulator/utils/options.py index de9cab34..8ad67f12 100644 --- a/neat/read_simulator/utils/options.py +++ b/neat/read_simulator/utils/options.py @@ -299,18 +299,23 @@ def log_configuration(self): _LOG.info(f"Single threading - 1 thread.") # We'll work on multithreading later... # _LOG.info(f'Multithreading - {options.threads} threads') + if self.fragment_mean: + if self.fragment_st_dev: + _LOG.info(f'Generating fragments based on mean={self.fragment_mean}, ' + f'stand. dev={self.fragment_st_dev}') + else: + raise ValueError("Provided fragment mean, but no fragment standard deviation!") if self.paired_ended: _LOG.info(f'Running in paired-ended mode.') if self.fragment_model: _LOG.info(f'Using fragment length model: {self.fragment_model}') - elif self.fragment_mean and self.fragment_st_dev: - _LOG.info(f'Generating fragment model based on mean={self.fragment_mean}, ' - f'st dev={self.fragment_st_dev}') + elif self.fragment_mean: + pass # Already addressed this above else: raise ValueError("Paired ended mode requires either a fragment model or a mean and standard deviation.") - else: _LOG.info(f'Running in single-ended mode.') + _LOG.info(f'Using a read length of {self.read_len}') _LOG.info(f'Average coverage: {self.coverage}') if self.error_model: diff --git a/tests/test_read_simulator/test_cover_dataset.py b/tests/test_read_simulator/test_generate_reads.py similarity index 73% rename from tests/test_read_simulator/test_cover_dataset.py rename to tests/test_read_simulator/test_generate_reads.py index c8193c1d..cbe1f996 100644 --- a/tests/test_read_simulator/test_cover_dataset.py +++ b/tests/test_read_simulator/test_generate_reads.py @@ -1,65 +1,69 @@ +from math import ceil + import pytest import numpy as np -from neat.models import FragmentLengthModel -from neat.read_simulator.utils import Options, cover_dataset +from neat.models import FragmentLengthModel, GcModel +from neat.read_simulator.utils import Options +from neat.read_simulator.utils.generate_reads import * def test_cover_dataset(): """Test that a cover is successfully generated for different coverage values""" - read_pool = [10] * 2000 span_length = 100 - target_vector = np.full(100, fill_value=10, dtype=int) options = Options(rng_seed=0) - options.read_len = 101 - options.paired_ended = True - options.fragment_mean = 250 - options.fragment_st_dev = 100 - options.output.overwrite_output = True - fragment_model = FragmentLengthModel(rng=options.rng) + options.read_len = 10 + options.paired_ended = False + options.overwrite_output = True + fragment_model = FragmentLengthModel(15, 5, rng=options.rng) - coverage_values = [1, 2, 5, 10, 25, 50] + coverage_values = [1, 2, 5] for coverage in coverage_values: options.coverage = coverage - read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model) + reads = cover_dataset(span_length, options, fragment_model) coverage_check = [] for i in range(span_length): # single ended test, only need read1 - cover 
= [x for x in read1 if i in range(x[0], x[1])] + cover = [x for x in reads if i in range(x[0], x[1])] coverage_check.append(len(cover)) - assert sum(coverage_check)/len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}" + print(f"\nRequested Coverage: {coverage}, Average Coverage: {sum(coverage_check) / len(coverage_check)}") + assert sum(coverage_check)/len(coverage_check) > 0.9 * coverage, f"Coverage check failed for coverage {coverage}" + for read in reads: + if read[1] - read[0] < 10: + raise AssertionError("failed to filter out a small read length") + assert len(reads) >= (100 * coverage)/10 def test_paired_cover_dataset(): """Test that a cover is successfully generated for different coverage values""" - read_pool = [10] * 2000 span_length = 100 - target_vector = np.full(100, fill_value=10, dtype=int) options = Options(rng_seed=0) - options.read_len = 101 + options.read_len = 10 options.paired_ended = True - options.fragment_mean = 250 - options.fragment_st_dev = 100 - options.output.overwrite_output = True - fragment_model = FragmentLengthModel(fragment_mean=20, fragment_std=2, fragment_min=10, fragment_max=30, rng=options.rng) + options.overwrite_output = True + fragment_model = FragmentLengthModel(25, 10, rng=options.rng) - coverage_values = [1, 2, 5, 10, 25, 50] + coverage_values = [1, 2, 5] for coverage in coverage_values: options.coverage = coverage - read1, read2 = cover_dataset(read_pool, span_length, target_vector, options, fragment_model) + reads = cover_dataset(span_length, options, fragment_model) coverage_check = [] for i in range(span_length): # paired ended test, need both read1 and read2 - cover = [x for x in read1 + read2 if i in range(x[0], x[1])] + cover = [x for x in reads if (i in range(x[0], x[1]) or i in range(x[2], x[3]))] coverage_check.append(len(cover)) - assert sum(coverage_check) / len(coverage_check) > coverage, f"Coverage check failed for coverage {coverage}" + print(f"\nRequested Coverage: {coverage}, Average Coverage: {sum(coverage_check) / len(coverage_check)}") + assert sum(coverage_check) / len(coverage_check) > 0.9 * coverage, \ + f"Coverage check failed for coverage {coverage}" + for read in reads: + if read[1] - read[0] < 10 or read[3] - read[2] < 10: + raise AssertionError("failed to filter out a small read length") + assert len(reads) >= (100 * coverage)/20 def test_various_read_lengths(): """Test cover_dataset with various read lengths to ensure no errors""" - read_pool = [10] * 2000 - span_length = 100 - target_vector = np.full(100, fill_value=10, dtype=int) + span_length = 1000 options = Options(rng_seed=0) options.paired_ended = True options.coverage = 10 @@ -71,7 +75,7 @@ def test_various_read_lengths(): for read_len in range(10, 251, 10): options.read_len = read_len try: - read1, _ = cover_dataset(read_pool, span_length, target_vector, options, fragment_model) + read1, _ = cover_dataset(span_length, options, fragment_model) except Exception as e: pytest.fail(f"Test failed for read_len={read_len} with exception: {e}") @@ -100,6 +104,7 @@ def test_fragment_mean_st_dev_combinations(): except Exception as e: pytest.fail(f"Test failed for mean={mean}, st_dev={st_dev} with exception: {e}") + def test_coverage_ploidy_combinations(): """Test cover_dataset with various combinations of coverage and ploidy values to ensure no errors""" read_pool = [10] * 2000 From 19770defc8f7445f90a5192507ec266e1ddeaf2c Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Mon, 20 May 2024 01:10:55 -0500 Subject: [PATCH 2/8] Still working 
through updates to coverage, properly implementing the heterozygosity. --- neat/common/ploid_functions.py | 26 +- neat/read_simulator/runner.py | 4 +- neat/read_simulator/utils/generate_reads.py | 266 +++++++++++------- .../read_simulator/utils/local_file_writer.py | 1 + .../utils/output_file_writer.py | 4 +- neat/read_simulator/utils/read.py | 172 ++++++----- .../test_generate_reads.py | 23 ++ 7 files changed, 302 insertions(+), 194 deletions(-) diff --git a/neat/common/ploid_functions.py b/neat/common/ploid_functions.py index d45682cc..1046c52d 100644 --- a/neat/common/ploid_functions.py +++ b/neat/common/ploid_functions.py @@ -28,24 +28,18 @@ def pick_ploids(ploidy: int, :return: a list of strings representing the genotype of each ploid. """ # number of ploids to make this mutation on (always at least 1) + how_many = 1 if rng.random() < homozygous_frequency: - if ploidy <= 2: - # If it's homozygous with a ploidy of 2 it's on both. - how_many = ploidy - else: - # if it's polyploid, it's on more than one ploid - # TODO may need to improve the modeling for polyploid - how_many = 1 - for i in range(ploidy): - # Not totally sure how to model this, so I'm counting each - # ploid as a separate homozygous event. That doesn't exactly make - # sense though, so we'll improve this later. - if rng.random() < homozygous_frequency: - how_many += 1 - else: - break + # We'll consider this one to be homozygous + how_many = ploidy else: - how_many = 1 + if ploidy == 1: + # special case where heteroyzgous makes no sense + how_many = 1 + else: + # if it's polyploid, we'll consider it to be on roughly half the ploids + # TODO may need to improve the modeling for polyploid, maybe + how_many = ploidy//2 # wp is just the temporary genotype list, a hat tip to the old version of NEAT wp = np.zeros(ploidy) diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 2e9f4965..8ff25b96 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -175,6 +175,7 @@ def read_simulator_runner(config: str, output: str): """ _LOG.info(f'Reading {options.reference}.') + # TODO check into SeqIO.index_db() reference_index = SeqIO.index(str(options.reference), "fasta") _LOG.debug("Reference file indexed.") @@ -301,7 +302,7 @@ def read_simulator_runner(config: str, output: str): pass if options.paired_ended: - max_qual_score = max(seq_error_model_1.quality_scores + seq_error_model_2.quality_scores) + max_qual_score = max(seq_error_model_1.quality_scores, seq_error_model_2.quality_scores) else: max_qual_score = max(seq_error_model_1.quality_scores) @@ -335,6 +336,7 @@ def read_simulator_runner(config: str, output: str): local_bam_pickle_file, seq_error_model_1, seq_error_model_2, + mut_model, gc_bias_model, fraglen_model, local_variants, diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 7a4bc699..73976665 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -10,7 +10,7 @@ from numpy.random import Generator from bisect import bisect_left, bisect_right -from ...models import SequencingErrorModel, GcModel, FragmentLengthModel +from ...models import SequencingErrorModel, GcModel, FragmentLengthModel, MutationModel from .options import Options from ...common import open_input, open_output, ALLOWED_NUCL from ...variants import ContigVariants @@ -19,24 +19,12 @@ __all__ = [ 'generate_reads', 'cover_dataset', + 'overlaps', ] _LOG = logging.getLogger(__name__) -def 
is_too_many_n(segment): - """ - Checks that the segment is valid and there aren't too many invalid characters in it. - - :param segment: the sequence to check - :return: True if there are too many invalid characters, or no reads; False otherwise - """ - if not segment: - return True - n = segment.upper().count('N') - return n / len(segment) >= 0.2 - - """ Currently this code block is unused, but I think it might be useful for parallelization, so I wanted to leave it in. @@ -198,10 +186,33 @@ def merge_sort(my_array: np.ndarray): return ret_array +def overlaps(test_interval: tuple[int, int], comparison_interval: tuple[int, int]) -> bool: + """ + This function checks if the read overlaps with an input interval. + :param test_interval: the interval to test, expressed as a tuple of end points + (understood to be a half-open interval) + :param comparison_interval: the interval to check against, expressed as a tuple of end points + + Four situations where we can say there is an overlap: + 1. The comparison interval contains the test interval start point + 2. The comparison interval contains the test interval end point + 3. The comparison interval contains both start and end points of the test interval + 4. The comparison interval is within the test interval + Although 3 is really just a special case of 1 and 2, so we don't need a separate check + + If the read is equal to the interval, then all of these will be trivially true, + and we don't need a separate check. + """ + return (comparison_interval[0] < test_interval[1] < comparison_interval[1]) or \ + (comparison_interval[0] <= test_interval[0] < comparison_interval[1]) or \ + (test_interval[0] <= comparison_interval[0] and test_interval[1] >= comparison_interval[1]) + + def generate_reads(reference: SeqRecord, reads_pickle: str, error_model_1: SequencingErrorModel, error_model_2: SequencingErrorModel | None, + mutation_model: MutationModel, gc_bias: GcModel, fraglen_model: FragmentLengthModel, contig_variants: ContigVariants, @@ -219,6 +230,7 @@ def generate_reads(reference: SeqRecord, :param reads_pickle: The file to put the reads generated into, for bam creation. :param error_model_1: The error model for this run, the forward strand :param error_model_2: The error model for this run, reverse strand + :param mutation_model: The mutation model for this run :param gc_bias: The GC-Bias model for this run :param fraglen_model: The fragment length model for this run :param contig_variants: An object containing all input and randomly generated variants to be included. @@ -257,13 +269,9 @@ def generate_reads(reference: SeqRecord, # heterozygous variants. Okay, so I've been applying the variant to all but for heterozygous we need to apply the # variant to only some - paired_reads = [] + # These will hold the values as inserted. + properly_paired_reads = [] singletons = [] - final_sam_dict = {} - if options.paired_ended: - # This is the orderd the reads will be in when sorted in the final sam file, if we're doing that. They are - # easier to sort now as sets of coordinates than they will be later as lines in a file. 
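# Referring back to the overlaps() helper added above: with half-open intervals, the four
# cases in its docstring can also be reasoned about with the single compact condition
# below. This simplified form is only a sanity check against the same cases the tests
# exercise, not the form the patch uses.
def overlaps_simple(test, comparison):
    return test[0] < comparison[1] and comparison[0] < test[1]
assert overlaps_simple((0, 151), (120, 400))      # clips the left edge of the region
assert overlaps_simple((100, 500), (120, 400))    # fully contains the region
assert not overlaps_simple((0, 120), (120, 400))  # half-open: merely touching is no overlap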
- sam_dict_order = sorted(reads, key=lambda element: (element[0:])) _LOG.debug("Writing fastq(s) and optional tsam, if indicated") t = time.process_time() @@ -274,94 +282,150 @@ def generate_reads(reference: SeqRecord, for i in range(len(reads)): print(f'{i/len(reads):.2%}', end='\r') - # Added some padding after, in case there are deletions - segments = [reference[reads[i][0]: reads[i][1] + 50].seq.upper(), - reference[reads[i][2]: reads[i][3] + 50].seq.upper()] - # Check for N concentration - # Make sure the segments will come out to at around 80% valid bases - # So as not to mess up the calculations, we will skip the padding we added above. - if not is_too_many_n(segments[0][:-50] + segments[1][:-50]): - read_name = f'{base_name}-{str(i)}' - - if options.produce_bam: - # This index gives the position of the read for the final bam file - # [0][0] gives us the first index where this occurs (which should be the only one) - final_order_read_index = np.where(sam_read_order == reads[i])[0][0] - if final_order_read_index not in final_sam_dict: - final_sam_dict[final_order_read_index] = [] - - # reads[i] = (left_start, left_end, right_start, right_end) - # If there is a read one: - if any(reads[i][0:2]): - segment = segments[0] - segment = replace_n(segment, options.rng) - - if any(reads[i][2:4]): - is_paired = True - else: - is_paired = False - - read1 = Read(name=read_name + "/1", - raw_read=reads[i], - reference_segment=segment, - reference_id=reference.id, - position=reads[i][0] + ref_start, - end_point=reads[i][1] + ref_start, - quality_offset=options.quality_offset, - is_paired=is_paired - ) - - read1.mutations = find_applicable_mutations(read1, contig_variants) - - read1.finalize_read_and_write(error_model_1, fq1, options.produce_fastq) - - if options.produce_bam: - # Save this for later - final_sam_dict[final_order_read_index].append(read1) - - elif options.produce_bam: - # If there's no read1, append a 0 placeholder - final_sam_dict[final_order_read_index].append(0) - - if any(reads[i][2:4]): - segment = segments[1] - segment = replace_n(segment, options.rng) - - if any(reads[i][0:2]): - is_paired = True - else: - is_paired = False - - read2 = Read(name=read_name + "/2", - raw_read=reads[i], - reference_segment=segment, - reference_id=reference.id, - position=reads[i][2] + ref_start, - end_point=reads[i][3] + ref_start, - quality_offset=options.quality_offset, - is_reverse=True, - is_paired=is_paired - ) - - read2.mutations = find_applicable_mutations(read2, contig_variants) - read2.finalize_read_and_write(error_model_2, fq2, options.produce_fastq) - - if options.produce_bam: - # Save this for later - final_sam_dict[final_order_read_index].append(read2) - - elif options.produce_bam: - # If there's no read2, append a 0 placeholder - final_sam_dict[final_order_read_index].append(0) + # First thing we'll do is check to see if this read is filtered out by a bed file + read1, read2 = (reads[i][0], reads[i][1]), (reads[i][2], reads[i][3]) + found_read1, found_read2 = False, False + # For no target bed, there wil only be one region to check and this will complete very quickly + for region in targeted_regions: + # If this is a false region, we can skip it + if not region[2]: + continue + # We need to make sure this hasn't been filtered already, so if any coordinate is nonzero (falsey) + if any(read1): + # Check if read1 is in this targeted region (any part of it overlaps) + if overlaps(read1, (region[0], region[1])): + found_read1 = True + # Again, make sure it hasn't been filtered, or this is 
a single-ended read + elif any(read2): + if overlaps(read2, (region[0], region[1])): + found_read2 = True + # This read was outside of targeted regions + if not found_read1: + # Filter out this read + read1 = (0, 0) + if not found_read2: + # Note that for single ended reads, it will never find read2 and this does nothing (it's already (0,0)) + read2 = (0, 0) + + # If there was no discard bed, this will complete very quickly + discard_read1, discard_read2 = False, False + for region in discarded_regions: + # If region[2] is False then this region is not being discarded and we can skip it + if not region[2]: + continue + # Check to make sure the read isn't already filtered out + if any(read1): + if overlaps(read1, (region[0], region[1])): + discard_read1 = True + # No need to worry about already filtered reads + if any(read2): + if overlaps(read2, (region[0], region[1])): + discard_read2 = True + if discard_read1: + read1 = (0, 0) + if discard_read2: + read2 = (0, 0) + + # This raw read will replace the original reads[i], and is the raw read with filters applied. + raw_read = read1 + read2 + + # If both reads were filtered out, we can move along + if not any(raw_read): + continue + else: + # We must have at least 1 read that has data + # If only read1 or read2 is absent, this is a singleton + read1_is_singleton = False + read2_is_singleton = False + if not any(read2): + # Note that this includes all single ended reads that passed filter + read1_is_singleton = True + singletons.append(raw_read) + elif not any(read1): + read2_is_singleton = True + singletons.append(raw_read) + else: + properly_paired_reads.append(raw_read) + + read_name = f'{base_name}-{str(i)}' + + # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended + if not read2_is_singleton: + # It's properly paired if it's not a singleton + is_properly_paired = not read1_is_singleton + # add a small amount of padding to the end to account for deletions + # 30 is basically arbitrary. This may be a function of read length or determinable? + # Though we don't figure out the variants until later + segment = reference[read1[0]: read1[1] + 30] + + read1 = Read(name=read_name + "/1", + raw_read=raw_read, + reference_segment=segment, + reference_id=reference.id, + position=read1[0] + ref_start, + end_point=read1[1] + ref_start, + quality_offset=options.quality_offset, + is_paired=is_properly_paired + ) + + read1.mutations = find_applicable_mutations(read1, contig_variants) + read1.finalize_read_and_write(error_model_1, mutation_model, fq1, options.produce_fastq) + + # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip + if not read1_is_singleton: + is_properly_paired = not read2_is_singleton + # Because this segment reads back to front, we need padding at the beginning. + # 30 is arbitrary. This may be a function of read length or determinable? 
+ # Though we don't figure out the variants until later + segment = reference[read2[0] - 30: read2[1]] + + read2 = Read(name=read_name + "/2", + raw_read=reads[i], + reference_segment=segment, + reference_id=reference.id, + position=read2[0] + ref_start, + end_point=read2[1] + ref_start, + quality_offset=options.quality_offset, + is_reverse=True, + is_paired=is_properly_paired + ) + + read2.mutations = find_applicable_mutations(read2, contig_variants) + read2.finalize_read_and_write(error_model_2, fq2, options.produce_fastq) _LOG.debug(f"Fastqs written in: {time.process_time() - t} s") + #TODO figure this bit out + if options.produce_bam: + # Save this for later + raw_sam_dict[final_order_read_index].append((read1, read1_is_singleton)) + + if options.produce_bam: + # Save this for later + raw_sam_dict[final_order_read_index].append((read2, read2_is_singleton)) + + # This is the order of the reads as they + sam_dict_order = [] + raw_sam_dict = {} + if options.paired_ended: + # This is the ordered the reads will be in when sorted in the final sam file, if we're doing that. They are + # easier to sort now as sets of coordinates than they will be later as lines in a file. + sam_dict_order = sorted(reads, key=lambda element: (element[0:])) + + if options.produce_bam: + # This index gives the position of the read for the final bam file + # [0][0] gives us the first index where this occurs (which should be the only one) + final_order_read_index = np.where(sam_dict_order == reads[i])[0][0] + if final_order_read_index not in raw_sam_dict: + raw_sam_dict[final_order_read_index] = [] + + if options.produce_bam: with open_output(reads_pickle) as reads: - pickle.dump(final_sam_dict, reads) + pickle.dump(raw_sam_dict, reads) if options.paired_ended: - _LOG.debug(f"Paired percentage = {len(paired_reads)/len(sam_read_order)}") + _LOG.debug(f"Paired percentage = {len(reads)/len(sam_dict_order)}") _LOG.info(f"Finished sampling reads in {time.time() - start} seconds") return chrom_fastq_r1, chrom_fastq_r2 diff --git a/neat/read_simulator/utils/local_file_writer.py b/neat/read_simulator/utils/local_file_writer.py index 607f7fdb..a291d4c3 100644 --- a/neat/read_simulator/utils/local_file_writer.py +++ b/neat/read_simulator/utils/local_file_writer.py @@ -47,6 +47,7 @@ def write_local_file(vcf_filename: str or Path, for loc in variant_data.variant_locations: for variant in variant_data[loc]: if not variant.is_input: + # Todo make sure this section still works target_region = find_region(loc, targeted_regions) # This will be True in targeted regions, if a bed is present, or everywhere if not bed is present. # Anything outside defined target regions will be marked false and this `if` will activate. diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index 244e225a..c065ea46 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -340,7 +340,7 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit SEQ_PACKED[alt_sequence[2 * i + 1].capitalize()])) # apparently samtools automatically adds 33 to the quality score string... 
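# For context on the comment above (standard Phred encoding rather than anything this
# patch defines, and assuming quality_offset is the usual 33): fastq quality characters
# are chr(score + offset), so the encoded_qual line below strips the offset back off to
# recover raw scores for the bam record.
qual_string = "".join(chr(q + 33) for q in [30, 35, 2])  # -> '?D#'
raw_scores = [ord(c) - 33 for c in qual_string]          # -> [30, 35, 2]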
- encoded_qual = ''.join([chr(ord(n) - 33) for n in read.read_quality_score]) + encoded_qual = ''.join([chr(ord(n) - 33) for n in read.read_quality_string]) """ block_size = 4 + # refID int32 @@ -357,7 +357,7 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit len(seq) """ - block_size = 32 + len(read.name) + 1 + len(encoded_cig) + len(encoded_seq) + len(read.read_quality_score) + block_size = 32 + len(read.name) + 1 + len(encoded_cig) + len(encoded_seq) + len(read.read_quality_string) bam_handle.write((pack(' 1: # Generate extra scores for insertions - new_quality_score.extend(model.rng.choice(low_scores, size=len(alternate)-1)) + new_quality_score.extend(rng.choice(low_scores, size=len(alternate)-1)) # If is deletion elif len(reference) > 1 and len(alternate) == 1: new_quality_score = new_quality_score[:1] # SNP else: - new_quality_score = model.rng.choice(low_scores, size=1) + new_quality_score = rng.choice(low_scores, size=1) # Replace the given quality score with the new one self.quality_array = \ @@ -156,13 +161,6 @@ def update_quality_array( np.array(new_quality_score), self.quality_array[location+len(reference):])) - if len(self.quality_array) < self.length: - # Just in case we need to fill out the quality score array - temp_list = self.quality_array.tolist() - self.quality_array = np.array(temp_list.append( - model.rng.choice(model.quality_scores, size=self.length-len(self.quality_array)) - )) - def apply_errors(self, mutated_sequence, err_model): """ This function applies errors to a sequence and calls the update_quality_array function after @@ -176,68 +174,89 @@ def apply_errors(self, mutated_sequence, err_model): mutated_sequence = \ mutated_sequence[:error.location] + error.alt + mutated_sequence[error.location+len(error.ref):] # update quality score for error - self.update_quality_array(error.ref, error.alt, error.location, "error", err_model) + self.update_quality_array( + error.ref, + error.alt, + error.location, + "error", + err_model.quality_scores, + err_model.rng + ) return mutated_sequence - def apply_mutations(self, mutated_sequence, err_model): + def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model: MutationModel): """ Applying mutations involves one extra step, because of polyploidism. There may be more than one mutation at a given location, so it is formulated as a list. We then pick one at random for this read. :param mutated_sequence: The sequence we are applying the variant to - :param err_model: The error_model for the run, used for the quality score and for the rng + :param quality_scores: The possible quality scores for this run (to update quality scores) + :param mut_model: The mutation model for the run, used for the zygosity and for the rng :return: mutated sequence, with mutations applied """ - # Start at the right position based on if this is the forward or reverse read. - segment_start = int(self.raw_read[2]) if self.is_reverse else int(self.raw_read[0]) for location in self.mutations: - # There may be more than one variant to apply, because of polyploidism. - # So we'll pick one at random for this read, each with equal probability. - # We may want to add a model to this. 
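# Hedged sketch of the zygosity draw that the reworked apply_mutations below performs
# (names here are illustrative; only the rng.choice-over-genotype idea comes from the
# patch): a genotype is a 0/1 vector over ploids, and a read keeps the variant only when
# the ploid it samples is a 1, so a diploid heterozygote lands in roughly half the reads.
import numpy as np
rng = np.random.default_rng(0)
genotype = np.array([0, 1])  # heterozygous on a diploid contig
hits = sum(rng.choice(genotype) for _ in range(10_000))
# expect hits to come out near 5_000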
- variant_to_apply = err_model.rng.choice(self.mutations[location]) + variant_to_apply = mut_model.rng.choice(self.mutations[location]) + # Fetch parameters qual_score = variant_to_apply.get_qual_score() - position = variant_to_apply.get_0_location() - segment_start - if type(variant_to_apply) == Insertion or type(variant_to_apply) == SingleNucleotideVariant: - reference_length = 1 - alternate = variant_to_apply.get_alt() - elif type(variant_to_apply) == Deletion: - reference_length = variant_to_apply.length - alternate = mutated_sequence[position] - else: - reference_length = variant_to_apply.get_ref_len() - alternate = mutated_sequence.get_alt() - - # Replace the entire ref with the entire alt - mutated_sequence = \ - mutated_sequence[:position] + alternate + mutated_sequence[position + reference_length:] - - self.update_quality_array( - self.reference_segment[location: location+reference_length], alternate, location, "mutation", err_model, qual_score - ) + # Find the position within the read for this variant + position = variant_to_apply.get_0_location() - self.position + # Figure out if the variant is in this read. If 'to_mutate' selects any 1, then it is mutated. + to_mutate = mut_model.rng.choice(variant_to_apply.genotype) + # If a 1 was selected, then apply the variant, else return the original sequence + if to_mutate: + if type(variant_to_apply) == Insertion or type(variant_to_apply) == SingleNucleotideVariant: + reference_length = 1 + alternate = variant_to_apply.get_alt() + elif type(variant_to_apply) == Deletion: + reference_length = variant_to_apply.length + alternate = mutated_sequence[position] + else: + reference_length = variant_to_apply.get_ref_len() + alternate = mutated_sequence.get_alt() + + # Replace the entire ref with the entire alt + mutated_sequence = mutated_sequence[:position] + alternate + mutated_sequence[position + reference_length:] + + self.update_quality_array( + self.reference_segment[location: location+reference_length], + alternate, + location, + "mutation", + quality_scores, + mut_model.rng, + qual_score + ) return mutated_sequence - def get_read(self, error_model) -> Seq: + def apply_variants_for_final_output( + self, + error_model: SequencingErrorModel, + mutation_model: MutationModel + ) -> Seq: """ Gets mutated sequence to output for fastq/bam :param error_model: The error model for the run + :param mutation_model: The mutation model for the run :return: the mutated sequence """ - mutated_sequence = self.reference_segment + # TODO Add filtering for bed files and heterozygosity right here + # The read sequence should have some padding still at this point + read_sequence = self.read_sequence.seq if self.mutations: - mutated_sequence = self.apply_mutations(mutated_sequence, error_model) + read_sequence = self.apply_mutations(read_sequence, list(error_model.quality_scores), mutation_model) if self.errors: - mutated_sequence = self.apply_errors(mutated_sequence, error_model) + read_sequence = self.apply_errors(read_sequence, error_model) - mutated_sequence = mutated_sequence[:self.length] + read_sequence = read_sequence[:self.length] self.quality_array = self.quality_array[:self.length] - return Seq(mutated_sequence) + return Seq(read_sequence) def contains(self, test_pos: int): return self.position <= test_pos < self.end_point @@ -275,6 +294,7 @@ def calculate_flags(self, paired_ended_run): def finalize_read_and_write( self, err_model: SequencingErrorModel, + mut_model: MutationModel, fastq_handle: TextIO, produce_fastq: bool, ): @@ -282,6 +302,7 @@ def 
finalize_read_and_write( Writes the record to the temporary fastq file :param err_model: The error model for the run + :param mut_model: The mutation model for the run :param fastq_handle: the path to the fastq model to write the read :param produce_fastq: If true, this will write out the temp fastqs. If false, this will only write out the tsams to create the bam files. @@ -290,28 +311,31 @@ def finalize_read_and_write( # Generate quality scores to cover the extended segment. We'll trim later self.quality_array = err_model.get_quality_scores(len(self.reference_segment)) - # Get errors for the read - self.errors = err_model.get_sequencing_errors(self.length, self.reference_segment, - self.quality_array) - - read_sequence = self.get_read(err_model) - if self.is_reverse: - self.quality_array = self.quality_array[::-1] - read_sequence = read_sequence.reverse_complement() - temp_position = self.position - self.position = len(self.reference_segment) - temp_position - self.length - self.end_point = len(self.reference_segment) - temp_position + read_quality_array = self.quality_array[::-1] + read_sequence = self.reference_segment.reverse_complement() + else: + read_quality_array = self.quality_array + read_sequence = self.reference_segment + + # Get errors for the read and update the quality score + self.errors = err_model.get_sequencing_errors(self.length, read_sequence, + read_quality_array) + # Update the read as needed: + self.quality_array = read_quality_array self.read_sequence = read_sequence - self.read_quality_score = "".join([chr(x + self.quality_offset) for x in self.quality_array]) + + self.read_sequence = self.apply_variants_for_final_output(err_model, mut_model) + + self.read_quality_string = "".join([chr(x + self.quality_offset) for x in self.quality_array]) self.mapping_quality = err_model.rng.poisson(70) if produce_fastq: fastq_handle.write(f'@{self.name}\n') fastq_handle.write(f'{str(self.read_sequence)}\n') fastq_handle.write('+\n') - fastq_handle.write(f'{self.read_quality_score}\n') + fastq_handle.write(f'{self.read_quality_string}\n') def make_cigar(self): """ diff --git a/tests/test_read_simulator/test_generate_reads.py b/tests/test_read_simulator/test_generate_reads.py index cbe1f996..883668ac 100644 --- a/tests/test_read_simulator/test_generate_reads.py +++ b/tests/test_read_simulator/test_generate_reads.py @@ -158,3 +158,26 @@ def test_single_ended_mode(): assert sum(coverage_check) / len(coverage_check) > options.coverage, "Coverage check failed in single-ended mode" except Exception as e: pytest.fail(f"Test failed in single-ended mode with exception: {e}") + + +def test_overlaps(): + comparsion_interval = (120, 400) + + interval1 = (0, 151) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (385, 421) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (130, 281) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (120, 400) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (100, 500) + assert overlaps(interval1, comparsion_interval) # overlaps + interval1 = (0, 100) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap + interval1 = (0, 120) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap + interval1 = (400, 450) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap + interval1 = (500, 700) + assert not overlaps(interval1, comparsion_interval) # doesn't overlap From 70b9dc53c9e5abc18b83144359b54dabaede079f Mon Sep 17 
00:00:00 2001 From: Joshua Allen Date: Mon, 20 May 2024 23:38:34 -0500 Subject: [PATCH 3/8] new coverage working --- neat/cli/cli.py | 2 +- neat/models/models.py | 6 +- neat/read_simulator/runner.py | 2 +- neat/read_simulator/utils/generate_reads.py | 63 ++++++++------------- neat/read_simulator/utils/read.py | 43 ++++++++------ 5 files changed, 57 insertions(+), 59 deletions(-) diff --git a/neat/cli/cli.py b/neat/cli/cli.py index 3a41ddf2..826a13d7 100644 --- a/neat/cli/cli.py +++ b/neat/cli/cli.py @@ -140,7 +140,7 @@ def main(parser: argparse.ArgumentParser, arguments: list[str]) -> int: else: end = time.time() log.info( - "command finished successfully; execution took %.3f sec", end - start + f"command finished successfully; execution took {(end - start)/60:.2f} sec" ) return 0 diff --git a/neat/models/models.py b/neat/models/models.py index fef6f270..e8f37559 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -417,6 +417,9 @@ def get_sequencing_errors(self, del_blacklist = [] for index in error_indexes[::-1]: + # Skip any N's + if reference_segment[index] == 'N': + continue # determine error type. Most will be SNVs error_type = SingleNucleotideVariant @@ -622,5 +625,4 @@ def generate_fragments(self, """ # generates a distribution, assuming normality, then rounds the result and converts to ints dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int) - - return list(dist) + return [abs(x) for x in dist] diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 8ff25b96..cf70a4f5 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -302,7 +302,7 @@ def read_simulator_runner(config: str, output: str): pass if options.paired_ended: - max_qual_score = max(seq_error_model_1.quality_scores, seq_error_model_2.quality_scores) + max_qual_score = max(max(seq_error_model_1.quality_scores), max(seq_error_model_2.quality_scores)) else: max_qual_score = max(seq_error_model_1.quality_scores) diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 73976665..c68c20de 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -260,7 +260,7 @@ def generate_reads(reference: SeqRecord, options, fraglen_model, ) - _LOG.debug(f"Dataset coverage took: {time.process_time() - t}") + _LOG.debug(f"Dataset coverage took: {(time.process_time() - t)/60} m") # Filters left to apply: N filter... not sure yet what to do with those (output random low quality reads, maybe) # Target regions @@ -337,22 +337,22 @@ def generate_reads(reference: SeqRecord, # If only read1 or read2 is absent, this is a singleton read1_is_singleton = False read2_is_singleton = False + properly_paired = False if not any(read2): # Note that this includes all single ended reads that passed filter read1_is_singleton = True - singletons.append(raw_read) elif not any(read1): + # read1 got filtered out read2_is_singleton = True - singletons.append(raw_read) else: - properly_paired_reads.append(raw_read) + properly_paired = True read_name = f'{base_name}-{str(i)}' # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: # It's properly paired if it's not a singleton - is_properly_paired = not read1_is_singleton + is_paired = not read1_is_singleton # add a small amount of padding to the end to account for deletions # 30 is basically arbitrary. 
This may be a function of read length or determinable? # Though we don't figure out the variants until later @@ -365,7 +365,7 @@ def generate_reads(reference: SeqRecord, position=read1[0] + ref_start, end_point=read1[1] + ref_start, quality_offset=options.quality_offset, - is_paired=is_properly_paired + is_paired=is_paired ) read1.mutations = find_applicable_mutations(read1, contig_variants) @@ -373,7 +373,7 @@ def generate_reads(reference: SeqRecord, # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip if not read1_is_singleton: - is_properly_paired = not read2_is_singleton + is_paired = not read2_is_singleton # Because this segment reads back to front, we need padding at the beginning. # 30 is arbitrary. This may be a function of read length or determinable? # Though we don't figure out the variants until later @@ -387,45 +387,32 @@ def generate_reads(reference: SeqRecord, end_point=read2[1] + ref_start, quality_offset=options.quality_offset, is_reverse=True, - is_paired=is_properly_paired + is_paired=is_paired ) read2.mutations = find_applicable_mutations(read2, contig_variants) - read2.finalize_read_and_write(error_model_2, fq2, options.produce_fastq) - - _LOG.debug(f"Fastqs written in: {time.process_time() - t} s") - - #TODO figure this bit out - if options.produce_bam: - # Save this for later - raw_sam_dict[final_order_read_index].append((read1, read1_is_singleton)) - - if options.produce_bam: - # Save this for later - raw_sam_dict[final_order_read_index].append((read2, read2_is_singleton)) + read2.finalize_read_and_write(error_model_2, mutation_model, fq2, options.produce_fastq) + if properly_paired: + properly_paired_reads.append((read1, read2)) + elif read1_is_singleton: + # This will be the choice for all single-ended reads + singletons.append(read1) + else: + singletons.append(read2) - # This is the order of the reads as they - sam_dict_order = [] - raw_sam_dict = {} - if options.paired_ended: - # This is the ordered the reads will be in when sorted in the final sam file, if we're doing that. They are - # easier to sort now as sets of coordinates than they will be later as lines in a file. - sam_dict_order = sorted(reads, key=lambda element: (element[0:])) + _LOG.debug(f"Fastqs written in: {(time.process_time() - t)/60:.2f} m") if options.produce_bam: - # This index gives the position of the read for the final bam file - # [0][0] gives us the first index where this occurs (which should be the only one) - final_order_read_index = np.where(sam_dict_order == reads[i])[0][0] - if final_order_read_index not in raw_sam_dict: - raw_sam_dict[final_order_read_index] = [] - + # this will give us the proper read order of the elements, for the sam. 
They are easier to sort now + properly_paired_reads = sorted(properly_paired_reads) + singletons = sorted(singletons) + sam_order = properly_paired_reads + singletons - if options.produce_bam: with open_output(reads_pickle) as reads: - pickle.dump(raw_sam_dict, reads) + pickle.dump(sam_order, reads) - if options.paired_ended: - _LOG.debug(f"Paired percentage = {len(reads)/len(sam_dict_order)}") + if options.paired_ended: + _LOG.debug(f"Properly paired percentage = {len(properly_paired_reads)/len(sam_order)}") - _LOG.info(f"Finished sampling reads in {time.time() - start} seconds") + _LOG.info(f"Finished sampling reads in {(time.process_time() - start)/60:.2f} m") return chrom_fastq_r1, chrom_fastq_r2 diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py index 7ffcf722..bebe95ea 100644 --- a/neat/read_simulator/utils/read.py +++ b/neat/read_simulator/utils/read.py @@ -12,7 +12,7 @@ from bisect import bisect_right from typing import TextIO -from Bio.Seq import Seq +from Bio.Seq import Seq, MutableSeq from Bio import pairwise2 from Bio.pairwise2 import format_alignment from numpy.random import Generator @@ -121,7 +121,6 @@ def update_quality_array( location: int, variant_type: str, quality_scores: list, - rng: Generator, quality_score: int = 0 ): """ @@ -139,21 +138,21 @@ def update_quality_array( if variant_type == "mutation": new_quality_score = [quality_score] * len(alternate) else: - # Since we have an error here, we'll choose a lower score + # Since we have an error here, we'll choose a min score original_quality_score = self.quality_array[location: location+len(reference)] new_quality_score = original_quality_score.copy() - # Generate a pool of low scores to choose from: - low_scores = quality_scores[:bisect_right(quality_score, max(quality_scores) // 3)] + low_score = min(quality_scores) # If is insertion if len(alternate) > 1: # Generate extra scores for insertions - new_quality_score.extend(rng.choice(low_scores, size=len(alternate)-1)) + # Original ref is unaffected, so it's quality score remains the same + new_quality_score = [low_score] * (len(alternate) - 1) # If is deletion elif len(reference) > 1 and len(alternate) == 1: - new_quality_score = new_quality_score[:1] + new_quality_score = [] # SNP else: - new_quality_score = rng.choice(low_scores, size=1) + new_quality_score = [low_score] # Replace the given quality score with the new one self.quality_array = \ @@ -161,7 +160,7 @@ def update_quality_array( np.array(new_quality_score), self.quality_array[location+len(reference):])) - def apply_errors(self, mutated_sequence, err_model): + def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorModel): """ This function applies errors to a sequence and calls the update_quality_array function after @@ -180,12 +179,11 @@ def apply_errors(self, mutated_sequence, err_model): error.location, "error", err_model.quality_scores, - err_model.rng ) return mutated_sequence - def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model: MutationModel): + def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mut_model: MutationModel): """ Applying mutations involves one extra step, because of polyploidism. There may be more than one mutation at a given location, so it is formulated as a list. We then pick one at random for this read. 
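As a rough, standalone illustration of that selection step (not the patched method itself; the Candidate class and its genotype tuple are assumed stand-ins for however the variants are actually stored):

    import numpy as np

    rng = np.random.default_rng(0)

    class Candidate:
        # Assumed stand-in for a stored variant: genotype holds one entry per copy,
        # e.g. (0, 1) for a heterozygous call.
        def __init__(self, alt, genotype):
            self.alt = alt
            self.genotype = genotype

    def pick_variant_for_read(variants_at_location, rng):
        # One candidate is drawn per site; its genotype then decides whether
        # this particular read's copy actually carries the mutation.
        variant = rng.choice(variants_at_location)
        return variant if rng.choice(variant.genotype) == 1 else None

    pick_variant_for_read([Candidate("T", (0, 1)), Candidate("TA", (1, 1))], rng)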
@@ -201,8 +199,8 @@ def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model # Fetch parameters qual_score = variant_to_apply.get_qual_score() - # Find the position within the read for this variant - position = variant_to_apply.get_0_location() - self.position + # Find the position within the read for this variant, cast as a python int, instead of a numpy int + position = int(variant_to_apply.get_0_location() - self.position) # Figure out if the variant is in this read. If 'to_mutate' selects any 1, then it is mutated. to_mutate = mut_model.rng.choice(variant_to_apply.genotype) # If a 1 was selected, then apply the variant, else return the original sequence @@ -212,7 +210,12 @@ def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model alternate = variant_to_apply.get_alt() elif type(variant_to_apply) == Deletion: reference_length = variant_to_apply.length - alternate = mutated_sequence[position] + try: + alternate = mutated_sequence[position] + except: + print(f"sequence: {mutated_sequence}") + print(f"position: {position}") + raise else: reference_length = variant_to_apply.get_ref_len() alternate = mutated_sequence.get_alt() @@ -226,7 +229,6 @@ def apply_mutations(self, mutated_sequence: Seq, quality_scores: list, mut_model location, "mutation", quality_scores, - mut_model.rng, qual_score ) @@ -245,9 +247,16 @@ def apply_variants_for_final_output( :return: the mutated sequence """ - # TODO Add filtering for bed files and heterozygosity right here + rng = mutation_model.rng # The read sequence should have some padding still at this point - read_sequence = self.read_sequence.seq + read_sequence = MutableSeq(self.read_sequence.seq) + # Deal with N's + for i in range(len(read_sequence)): + if read_sequence[i] == 'N': + # for now replace 'N' masked regions with pure noise. We may refine this in time + read_sequence[i] = rng.choice(['A', 'C', 'G', 'T']) + self.quality_array[i] = min(error_model.quality_scores) + if self.mutations: read_sequence = self.apply_mutations(read_sequence, list(error_model.quality_scores), mutation_model) if self.errors: From ed51167bba4d2503386f6de2e9bcc252ffe862af Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 23 May 2024 12:36:03 -0500 Subject: [PATCH 4/8] The changed code seems to be working. 
Will run a few more tests before issuing the PR --- neat/cli/cli.py | 2 +- neat/read_simulator/runner.py | 5 +- neat/read_simulator/utils/generate_reads.py | 115 +++++++++------ .../utils/output_file_writer.py | 137 +++++++++++------- neat/read_simulator/utils/read.py | 28 ++-- 5 files changed, 174 insertions(+), 113 deletions(-) diff --git a/neat/cli/cli.py b/neat/cli/cli.py index 826a13d7..848c8fd2 100644 --- a/neat/cli/cli.py +++ b/neat/cli/cli.py @@ -140,7 +140,7 @@ def main(parser: argparse.ArgumentParser, arguments: list[str]) -> int: else: end = time.time() log.info( - f"command finished successfully; execution took {(end - start)/60:.2f} sec" + f"command finished successfully; execution took {(end - start)/60:.2f} m" ) return 0 diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index cf70a4f5..0c6cd447 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -331,7 +331,7 @@ def read_simulator_runner(config: str, output: str): fasta_files.extend(local_fasta_file) if options.produce_fastq or options.produce_bam: - read1_fastq, read2_fastq = \ + read1_fastq_paired, read1_fastq_single, read2_fastq_paired, read2_fastq_single = \ generate_reads(local_reference, local_bam_pickle_file, seq_error_model_1, @@ -346,7 +346,8 @@ def read_simulator_runner(config: str, output: str): options, contig) - fastq_files.append((read1_fastq, read2_fastq)) + contig_temp_fastqs = ((read1_fastq_paired, read2_fastq_paired), (read1_fastq_single, read2_fastq_single)) + fastq_files.append(contig_temp_fastqs) if options.produce_bam: sam_reads_files.append(local_bam_pickle_file) diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index c68c20de..c158f87c 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -16,6 +16,8 @@ from ...variants import ContigVariants from .read import Read +# TODO check that we're not truncating reads with deletions, but getting a full 151 bases + __all__ = [ 'generate_reads', 'cover_dataset', @@ -245,13 +247,20 @@ def generate_reads(reference: SeqRecord, :return: A tuple of the filenames for the temp files created """ # Set up files for use. May not need r2, but it's there if we do. - chrom_fastq_r1 = temporary_directory / f'{chrom}_tmp_r1.fq.bgz' - chrom_fastq_r2 = temporary_directory / f'{chrom}_tmp_r2.fq.bgz' + # We will separate the properly paired and the singletons. + # For now, we are making an assumption that the chromosome name contains no invalid characters for bash file names + # such as `*` or `:` even though those are technically allowed. + # We need to insure "chrom" does not contain `_` but we will assume that is the case for now. + # TODO We'll need to add some checks to ensure that this is the case. 
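A quick illustration of the concern in the TODO above (file and read names here are purely illustrative): both the temp-file suffix and the read-name prefix are parsed back later by splitting on '_', so an underscore inside the contig name itself would truncate the recovered name.

    from pathlib import Path

    # Temp-file name back to contig (mirrors the removesuffix call used when merging):
    assert Path("chr21_r1_paired.fq.bgz").name.removesuffix("_r1_paired.fq.bgz") == "chr21"
    # Read name back to contig: fine while the contig has no underscores of its own...
    assert "NEAT-generated_chr21_17".removeprefix("NEAT-generated_").split("_")[0] == "chr21"
    # ...but a contig named 'chr_21' would come back truncated to 'chr'
    assert "NEAT-generated_chr_21_17".removeprefix("NEAT-generated_").split("_")[0] == "chr"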
+ chrom_fastq_r1_paired = temporary_directory / f'{chrom}_r1_paired.fq.bgz' + chrom_fastq_r1_single = temporary_directory / f'{chrom}_r1_single.fq.bgz' + chrom_fastq_r2_paired = temporary_directory / f'{chrom}_r2_paired.fq.bgz' + chrom_fastq_r2_single = temporary_directory / f'{chrom}_r2_single.fq.bgz' _LOG.info(f'Sampling reads...') - start = time.time() + start_time = time.time() - base_name = f'{Path(options.output).name}-{chrom}' + base_name = f'NEAT-generated_{chrom}' _LOG.debug("Covering dataset.") t = time.process_time() @@ -260,7 +269,7 @@ def generate_reads(reference: SeqRecord, options, fraglen_model, ) - _LOG.debug(f"Dataset coverage took: {(time.process_time() - t)/60} m") + _LOG.debug(f"Dataset coverage took: {(time.process_time() - t)/60:.2f} m") # Filters left to apply: N filter... not sure yet what to do with those (output random low quality reads, maybe) # Target regions @@ -276,8 +285,10 @@ def generate_reads(reference: SeqRecord, _LOG.debug("Writing fastq(s) and optional tsam, if indicated") t = time.process_time() with ( - open_output(chrom_fastq_r1) as fq1, - open_output(chrom_fastq_r2) as fq2 + open_output(chrom_fastq_r1_paired) as fq1_paired, + open_output(chrom_fastq_r1_single) as fq1_single, + open_output(chrom_fastq_r2_paired) as fq2_paired, + open_output(chrom_fastq_r2_single) as fq2_single ): for i in range(len(reads)): @@ -296,10 +307,10 @@ def generate_reads(reference: SeqRecord, if overlaps(read1, (region[0], region[1])): found_read1 = True # Again, make sure it hasn't been filtered, or this is a single-ended read - elif any(read2): + if any(read2): if overlaps(read2, (region[0], region[1])): found_read2 = True - # This read was outside of targeted regions + # This read was outside targeted regions if not found_read1: # Filter out this read read1 = (0, 0) @@ -347,7 +358,7 @@ def generate_reads(reference: SeqRecord, else: properly_paired = True - read_name = f'{base_name}-{str(i)}' + read_name = f'{base_name}_{str(i)}' # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: @@ -356,20 +367,27 @@ def generate_reads(reference: SeqRecord, # add a small amount of padding to the end to account for deletions # 30 is basically arbitrary. This may be a function of read length or determinable? 
# Though we don't figure out the variants until later - segment = reference[read1[0]: read1[1] + 30] - read1 = Read(name=read_name + "/1", - raw_read=raw_read, - reference_segment=segment, - reference_id=reference.id, - position=read1[0] + ref_start, - end_point=read1[1] + ref_start, - quality_offset=options.quality_offset, - is_paired=is_paired - ) - - read1.mutations = find_applicable_mutations(read1, contig_variants) - read1.finalize_read_and_write(error_model_1, mutation_model, fq1, options.produce_fastq) + # this start check ensures that we get a valid segment + segment = reference[read1[0]: read1[1] + 30].seq + + read_1 = Read( + name=read_name + "/1", + raw_read=raw_read, + reference_segment=segment, + reference_id=reference.id, + position=read1[0] + ref_start, + end_point=read1[1] + ref_start, + quality_offset=options.quality_offset, + is_paired=is_paired + ) + + read_1.mutations = find_applicable_mutations(read_1, contig_variants) + if is_paired: + handle = fq1_paired + else: + handle = fq1_single + read_1.finalize_read_and_write(error_model_1, mutation_model, handle, options.produce_fastq) # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip if not read1_is_singleton: @@ -377,30 +395,39 @@ def generate_reads(reference: SeqRecord, # Because this segment reads back to front, we need padding at the beginning. # 30 is arbitrary. This may be a function of read length or determinable? # Though we don't figure out the variants until later - segment = reference[read2[0] - 30: read2[1]] - - read2 = Read(name=read_name + "/2", - raw_read=reads[i], - reference_segment=segment, - reference_id=reference.id, - position=read2[0] + ref_start, - end_point=read2[1] + ref_start, - quality_offset=options.quality_offset, - is_reverse=True, - is_paired=is_paired - ) - - read2.mutations = find_applicable_mutations(read2, contig_variants) - read2.finalize_read_and_write(error_model_2, mutation_model, fq2, options.produce_fastq) + + # this start check ensures that we get a valid segment + start_cooridnate = max((read2[0] - 30), 0) + segment = reference[start_cooridnate: read2[1]].seq + + read_2 = Read( + name=read_name + "/2", + raw_read=reads[i], + reference_segment=segment, + reference_id=reference.id, + position=read2[0] + ref_start, + end_point=read2[1] + ref_start, + quality_offset=options.quality_offset, + is_reverse=True, + is_paired=is_paired + ) + + read_2.mutations = find_applicable_mutations(read_2, contig_variants) + if is_paired: + handle = fq2_paired + else: + handle = fq2_single + read_2.finalize_read_and_write(error_model_2, mutation_model, handle, options.produce_fastq) + if properly_paired: - properly_paired_reads.append((read1, read2)) + properly_paired_reads.append((read_1, read_2)) elif read1_is_singleton: # This will be the choice for all single-ended reads - singletons.append(read1) + singletons.append((read_1, None)) else: - singletons.append(read2) + singletons.append((None, read_2)) - _LOG.debug(f"Fastqs written in: {(time.process_time() - t)/60:.2f} m") + _LOG.debug(f"Contig fastq(s) written in: {(time.process_time() - t)/60:.2f} m") if options.produce_bam: # this will give us the proper read order of the elements, for the sam. 
They are easier to sort now @@ -414,5 +441,5 @@ def generate_reads(reference: SeqRecord, if options.paired_ended: _LOG.debug(f"Properly paired percentage = {len(properly_paired_reads)/len(sam_order)}") - _LOG.info(f"Finished sampling reads in {(time.process_time() - start)/60:.2f} m") - return chrom_fastq_r1, chrom_fastq_r2 + _LOG.info(f"Finished sampling reads in {(time.time() - start_time)/60:.2f} m") + return chrom_fastq_r1_paired, chrom_fastq_r1_single, chrom_fastq_r2_paired, chrom_fastq_r2_single diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index c065ea46..5958486f 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -191,68 +191,97 @@ def merge_temp_fastqs( will be no IO time. :param fastq_files: The temporary fastq files to combine in the final output, or to set the order for the - final bam + final bam. Each set of files has two subsets, the paired and the singles. We'll do all the paired first + then append any singletons :param paired_ended: whether this run is paired or single ended. :param rand_num_gen: the random number generator for the run """ - fastq_indexes = [] - read1_keys = [] - read2_keys = [] - - # Index the temp fastqs - for file_pair in fastq_files: + fastq_index_dict = {} + # read1_keys will hold all the paired r1 paired ended reads (if any) and all singleton reads + paired_keys = [] + singleton_keys = [] + + paired_files = [] + singleton_files = [] + + # First split the files into 2 categories. For paired ended reads, most reads will be paired, though some may + # have gotten filtered out with input files. We will append any singletons in paired ended mode at the end, + # Or else for single ended we will end up just writing the singletons. 
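The ordering idea, as a self-contained sketch with toy data (the names below are illustrative, not the variables used in this file): mate pairs are shuffled as units so R1 and R2 stay synchronized, and singletons are appended afterwards.

    import numpy as np

    rng = np.random.default_rng(0)
    r1_paired = {"read_1/1": "ACGT", "read_2/1": "GGCA"}
    r2_paired = {"read_1/2": "TTAG", "read_2/2": "CCAT"}
    singletons = {"read_3/1": "AATC"}

    paired_keys = list(zip(r1_paired, r2_paired))   # [('read_1/1', 'read_1/2'), ...]
    order = rng.permutation(len(paired_keys))       # shuffle pairs as units, not individual mates

    fq1_out, fq2_out = [], []
    for idx in order:
        key1, key2 = paired_keys[idx]
        fq1_out.append(r1_paired[key1])
        fq2_out.append(r2_paired[key2])
    for key in singletons:                          # strays (or all reads, for single-ended) go to R1
        fq1_out.append(singletons[key])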
+ for coupled_files in fastq_files: + paired_file_names, singleton_file_names = coupled_files + paired_files.append(paired_file_names) + singleton_files.append(singleton_file_names) + + # Index the temp paired-ended fastqs + for file_pair in paired_files: file1_index = SeqIO.index(str(file_pair[0]), 'fastq') - file2_index = None - if file_pair[1]: - file2_index = SeqIO.index(str(file_pair[1]), 'fastq') - read2_keys.append(list(file2_index)) + file2_index = SeqIO.index(str(file_pair[1]), 'fastq') + # Reconstruct the name of the reads + contig_name = Path(file_pair[0]).name.removesuffix('_r1_paired.fq.bgz') + # Either both will have data, or neither, so checking one is sufficient + if file1_index: + if contig_name not in fastq_index_dict: + fastq_index_dict[contig_name] = [] + # 1 and 2 for read 1 and read 2 + fastq_index_dict[contig_name] = {1: file1_index, 2: file2_index} + paired_keys.extend(list(zip(file1_index, file2_index))) + + # Index the singletons, or for single-ended reads, all reads + for file_pair in singleton_files: + if file_pair[0]: + file_index = SeqIO.index(str(file_pair[0]), 'fastq') + contig_name = Path(file_pair[0]).name.strip('_r1_single.fq.bgz') + elif file_pair[1]: + file_index = SeqIO.index(str(file_pair[1]), 'fastq') + contig_name = Path(file_pair[1]).name.strip('_r2_single.fq.bgz') else: - read2_keys.append(file2_index) - fastq_indexes.append((file1_index, file2_index)) - read1_keys.append(list(file1_index)) - - # Flatten keys lists to shuffle the order - all_read1_keys = [key for sublist in read1_keys for key in sublist] - all_read2_keys = [key for sublist in read2_keys for key in sublist] - + # So singletons for this contig, so move on + continue + + # A check in case all reads were properly paired and there are no singletons + if file_index: + if contig_name not in fastq_index_dict: + fastq_index_dict[contig_name] = [] + # To keep the data structure consistent, we point both keys at the same file + fastq_index_dict[contig_name] = {1: file_index, 2: file_index} + singleton_keys.extend(list(file_index)) + + shuffled_paired_keys = paired_keys.copy() + shuffled_singleton_keys = singleton_keys.copy() # Shuffle the keys - rand_num_gen.shuffle(all_read1_keys) - - read2_singletons = [key for key in all_read2_keys if key not in all_read1_keys] + rand_num_gen.shuffle(shuffled_paired_keys) + rand_num_gen.shuffle(shuffled_singleton_keys) + # So we can delete later + wrote_r2 = False with ( open_output(self.fastq1_fn) as fq1, open_output(self.fastq2_fn) as fq2 ): - for i in range(len(all_read1_keys)): - - current_key = all_read1_keys[i] - - # This gives us the first index of the 'row' in the keys table where this read is located, - # giving us the list index of the file index where I can find this read - current_index = [read1_keys.index(x) for x in read1_keys if current_key in x][0] - # This is the index of the read1 file, at the current key, which should be the read we want - read1 = fastq_indexes[current_index][0][current_key] + # First we add all properly paired reads + for i in range(len(shuffled_paired_keys)): + current_key = shuffled_paired_keys[i] + # reconstruct tho chromosome name + chrom_name = current_key[0].removeprefix("NEAT-generated_").split('_')[0] + # 1 here because this is read1 + read1 = fastq_index_dict[chrom_name][1][current_key[0]] SeqIO.write(read1, fq1, 'fastq') - - if current_key in fastq_indexes[current_index][1]: - read2 = fastq_indexes[current_index][1][current_key] - SeqIO.write(read2, fq2, 'fastq') - - for j in range(len(read2_singletons)): - - 
current_key = read2_singletons[j] - - current_index = [read2_keys.index(x) for x in read2_keys if current_key in x][0] - read = fastq_indexes[current_index][1][current_key] - SeqIO.write(read, fq2, 'fastq') - - if not paired_ended: - + # 2 for read2 + read2 = fastq_index_dict[chrom_name][2][current_key[1]] + SeqIO.write(read2, fq2, 'fastq') + if not wrote_r2: + wrote_r2 = True + + # Next we add the strays (or all reads, for single-ended) + for j in range(len(shuffled_singleton_keys)): + current_key = shuffled_singleton_keys[j] + chrom_name = current_key[0].removeprefix("NEAT_generated_").split('_')[0] + read = fastq_index_dict[chrom_name][1][current_key[0]] + SeqIO.write(read, fq1, 'fastq') + + if not wrote_r2: fastq2_path = Path(self.fastq2_fn) - - if fastq2_path.exists(): - fastq2_path.unlink() + fastq2_path.unlink() def output_bam_file(self, reads_files: list, contig_dict: dict): """ @@ -263,7 +292,7 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): :param contig_dict: A dictionary with the keys as contigs from the reference, and the values the index of that contig """ - + # TODO incorporate new read list (no longer a dictionary) from generate_reads pickle file bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL) bam_out.write("BAM\1") header = "@HD\tVN:1.4\tSO:coordinate\n" @@ -284,9 +313,9 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): for file in reads_files: contig_reads_data = pickle.load(gzip.open(file)) - for key in contig_reads_data: - read1 = contig_reads_data[key][0] - read2 = contig_reads_data[key][0] + for read_data in contig_reads_data: + read1 = read_data[0] + read2 = read_data[1] if read1: self.write_bam_record(read1, contig_dict[read1.reference_id], bam_out) @@ -317,7 +346,7 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit next_ref_id = contig_id - if mate_position is None: + if not mate_position: next_pos = 0 template_length = 0 else: diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py index bebe95ea..70399939 100644 --- a/neat/read_simulator/utils/read.py +++ b/neat/read_simulator/utils/read.py @@ -130,7 +130,6 @@ def update_quality_array( :param location: The first position of the mutation, in 0-based coordinates :param variant_type: Either "mutations" or "errors" :param quality_scores: The possible quality scores, used to adjust for mutations and errors - :param rng: The random number generator for the run :param quality_score: The quality score to use, since this has already been calculated for mutations. 
This must be included if type is 'mutation' :return: None, updates the quality array in place @@ -178,7 +177,7 @@ def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorM error.alt, error.location, "error", - err_model.quality_scores, + list(err_model.quality_scores), ) return mutated_sequence @@ -210,12 +209,7 @@ def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mu alternate = variant_to_apply.get_alt() elif type(variant_to_apply) == Deletion: reference_length = variant_to_apply.length - try: - alternate = mutated_sequence[position] - except: - print(f"sequence: {mutated_sequence}") - print(f"position: {position}") - raise + alternate = mutated_sequence[position] else: reference_length = variant_to_apply.get_ref_len() alternate = mutated_sequence.get_alt() @@ -249,7 +243,7 @@ def apply_variants_for_final_output( rng = mutation_model.rng # The read sequence should have some padding still at this point - read_sequence = MutableSeq(self.read_sequence.seq) + read_sequence = MutableSeq(self.read_sequence) # Deal with N's for i in range(len(read_sequence)): if read_sequence[i] == 'N': @@ -353,8 +347,13 @@ def make_cigar(self): # These parameters were set to minimize breaks in the mutated sequence and find the best # alignment from there. + if self.is_reverse: + read_sequence = self.read_sequence.reverse_complement().upper() + else: + read_sequence = self.read_sequence.upper() + reference_segment = self.reference_segment.upper() raw_alignment = pairwise2.align.localms( - self.reference_segment, self.read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1, + reference_segment, read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1, penalize_extend_when_opening=True ) @@ -366,7 +365,8 @@ def make_cigar(self): cig_string = '' # Find first match start = min([aligned_mut_seq.find(x) for x in ALLOWED_NUCL]) - for char in range(start, start + len(self.read_sequence)): + end = start + len(read_sequence) + for char in range(start, end): if aligned_template_seq[char] == '-': # insertion if curr_char == 'I': # more insertions cig_count = cig_count + 1 @@ -391,7 +391,11 @@ def make_cigar(self): cig_string = cig_string + str(cig_count) + curr_char curr_char = 'M' cig_count = 1 - return cig_string + str(cig_count) + curr_char + # A check to make sure this doesn't end on a D + if curr_char == 'D': + return cig_string + str(cig_count) + curr_char + str(cig_count) + 'M' + else: + return cig_string + str(cig_count) + curr_char def get_mpos(self): """ From db2436c123310df000c8b11086efab666e9652b4 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Thu, 23 May 2024 19:19:58 -0500 Subject: [PATCH 5/8] Added some padding to handle edge case reads at the tail end of a chromosome. --- neat/models/models.py | 12 +++- neat/read_simulator/runner.py | 2 +- neat/read_simulator/utils/generate_reads.py | 22 +++++-- .../utils/output_file_writer.py | 16 +++-- neat/read_simulator/utils/read.py | 64 ++++++++++++------- 5 files changed, 77 insertions(+), 39 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index e8f37559..258a74f8 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -388,16 +388,17 @@ def __init__(self, def get_sequencing_errors(self, read_length: int, + padding: int, reference_segment: SeqRecord, quality_scores: np.ndarray): """ Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score based on the container model. 
:param read_length: The length of the read to generate errors for. + :param padding: this is the amount of space we have in the read for deletions. :param reference_segment: The section of the reference from which the read is drawn :param quality_scores: Array of quality scores for the read - :return: modified sequence and associated quality scores - + :return: Modified sequence and associated quality scores """ error_indexes = [] @@ -432,6 +433,10 @@ def get_sequencing_errors(self, # Deletion error if error_type == Deletion: deletion_length = self.get_deletion_length() + if padding - deletion_length < 0: + # No more space in this read to add deletions + padding = 0 + continue deletion_reference = reference_segment.seq[index: index + deletion_length + 1] deletion_alternate = deletion_reference[0] introduced_errors.append( @@ -439,6 +444,7 @@ def get_sequencing_errors(self, ) num_indels_so_far += deletion_length del_blacklist.extend(list(range(index, index + deletion_length))) + padding -= deletion_length elif error_type == Insertion: insertion_length = self.get_insertion_length() @@ -467,7 +473,7 @@ def get_sequencing_errors(self, if introduced_errors[i].location in del_blacklist: del introduced_errors[i] - return introduced_errors + return introduced_errors, max(padding, 0) def quality_index_remap(self, input_read_length): """ diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 0c6cd447..7b408944 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -371,4 +371,4 @@ def read_simulator_runner(config: str, output: str): _LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}") contig_list = list(reference_index) contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))} - output_file_writer.output_bam_file(sam_reads_files, contigs_by_index) + output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len) diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index c158f87c..05a4f2c6 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -358,8 +358,10 @@ def generate_reads(reference: SeqRecord, else: properly_paired = True - read_name = f'{base_name}_{str(i)}' + read_name = f'{base_name}_{str(i+1)}' + # Initialize variable to track padding + padding = 0 # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: # It's properly paired if it's not a singleton @@ -368,8 +370,9 @@ def generate_reads(reference: SeqRecord, # 30 is basically arbitrary. This may be a function of read length or determinable? # Though we don't figure out the variants until later - # this start check ensures that we get a valid segment - segment = reference[read1[0]: read1[1] + 30].seq + # upper ensures that we get a segment with NEAT-recognized bases + segment = reference[read1[0]: read1[1] + 30].seq.upper() + padding = len(segment) - options.read_len read_1 = Read( name=read_name + "/1", @@ -378,6 +381,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read1[0] + ref_start, end_point=read1[1] + ref_start, + padding=padding, quality_offset=options.quality_offset, is_paired=is_paired ) @@ -396,9 +400,14 @@ def generate_reads(reference: SeqRecord, # 30 is arbitrary. This may be a function of read length or determinable? 
# Though we don't figure out the variants until later - # this start check ensures that we get a valid segment - start_cooridnate = max((read2[0] - 30), 0) - segment = reference[start_cooridnate: read2[1]].seq + # If we had a certain amount of padding from read_1, we'll use that here + if padding: + start_coordinate = max((read2[0] - padding), 0) + else: + start_coordinate = max((read2[0] - 30), 0) + # upper ensures that we get a segment with NEAT-recognized bases + segment = reference[start_coordinate: read2[1]].seq.upper() + padding = len(segment) - options.read_len read_2 = Read( name=read_name + "/2", @@ -407,6 +416,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read2[0] + ref_start, end_point=read2[1] + ref_start, + padding=padding, quality_offset=options.quality_offset, is_reverse=True, is_paired=is_paired diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index 5958486f..93a87912 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -283,7 +283,7 @@ def merge_temp_fastqs( fastq2_path = Path(self.fastq2_fn) fastq2_path.unlink() - def output_bam_file(self, reads_files: list, contig_dict: dict): + def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int): """ This section is for producing a CIGAR string using a temp sam file(sam file with original sequence instead of a cigar string) @@ -291,6 +291,7 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): :param reads_files: The list of temp sams to combine :param contig_dict: A dictionary with the keys as contigs from the reference, and the values the index of that contig + :param read_length: the length of the reads for this run """ # TODO incorporate new read list (no longer a dictionary) from generate_reads pickle file bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL) @@ -317,19 +318,20 @@ def output_bam_file(self, reads_files: list, contig_dict: dict): read1 = read_data[0] read2 = read_data[1] if read1: - self.write_bam_record(read1, contig_dict[read1.reference_id], bam_out) + self.write_bam_record(read1, contig_dict[read1.reference_id], bam_out, read_length) if read2: - self.write_bam_record(read2, contig_dict[read2.reference_id], bam_out) + self.write_bam_record(read2, contig_dict[read2.reference_id], bam_out, read_length) bam_out.close() - def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWriter): + def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWriter, read_length: int): """ Takes a read object and writes it out as a bam record :param read: a read object containing everything we need to write it out. :param contig_id: the index of the reference for this :param bam_handle: the handle of the file object to write to. 
+ :param read_length: the length of the read to output """ read_bin = reg2bin(read.position, read.end_point) @@ -356,8 +358,8 @@ def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWrit for i in range(cig_ops): encoded_cig.extend(pack(' Date: Thu, 23 May 2024 22:02:21 -0500 Subject: [PATCH 6/8] Working on ironing out same variables --- neat/models/models.py | 3 - neat/read_simulator/utils/generate_reads.py | 27 +---- neat/read_simulator/utils/read.py | 103 ++++++++++++-------- 3 files changed, 66 insertions(+), 67 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index 258a74f8..8d61b4db 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -418,9 +418,6 @@ def get_sequencing_errors(self, del_blacklist = [] for index in error_indexes[::-1]: - # Skip any N's - if reference_segment[index] == 'N': - continue # determine error type. Most will be SNVs error_type = SingleNucleotideVariant diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 05a4f2c6..6f5165cd 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -154,26 +154,6 @@ def find_applicable_mutations(my_read: Read, all_variants: ContigVariants) -> di return return_dict -def replace_n(segment: Seq, rng: Generator) -> Seq: - """ - Replaces invalid characters with random valid characters - - :param segment: A Seq object containing a DNA sequence - :param rng: The random number generator for the run - :return: The modified sequence object - """ - modified_segment = "" - # This takes care of soft masking - segment_no_mask = segment.upper() - for base in segment_no_mask: - if base in ALLOWED_NUCL: - modified_segment += base - else: - modified_segment += rng.choice(ALLOWED_NUCL) - - return Seq(modified_segment) - - def merge_sort(my_array: np.ndarray): """ This sorts the reads in reverse position, merging as it goes, in order to get the proper order @@ -369,9 +349,8 @@ def generate_reads(reference: SeqRecord, # add a small amount of padding to the end to account for deletions # 30 is basically arbitrary. This may be a function of read length or determinable? 
# Though we don't figure out the variants until later + segment = reference[read1[0]: read1[1] + 30].seq - # upper ensures that we get a segment with NEAT-recognized bases - segment = reference[read1[0]: read1[1] + 30].seq.upper() padding = len(segment) - options.read_len read_1 = Read( @@ -405,8 +384,8 @@ def generate_reads(reference: SeqRecord, start_coordinate = max((read2[0] - padding), 0) else: start_coordinate = max((read2[0] - 30), 0) - # upper ensures that we get a segment with NEAT-recognized bases - segment = reference[start_coordinate: read2[1]].seq.upper() + # this ensures that we get a segment with NEAT-recognized bases + segment = reference[start_coordinate: read2[1]].seq padding = len(segment) - options.read_len read_2 = Read( diff --git a/neat/read_simulator/utils/read.py b/neat/read_simulator/utils/read.py index f55c188c..8533ab63 100644 --- a/neat/read_simulator/utils/read.py +++ b/neat/read_simulator/utils/read.py @@ -162,18 +162,18 @@ def update_quality_array( np.array(new_quality_score), self.quality_array[location+len(reference):])) - def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorModel): + def apply_errors(self, err_model: SequencingErrorModel): """ This function applies errors to a sequence and calls the update_quality_array function after - :param mutated_sequence: The sequence to add errors to. :param err_model: The error model for this run, :return: None, The sequence, with errors applied """ + mutated_sequence = "" for error in self.errors: # Replace the entire ref sequence with the entire alt sequence mutated_sequence = \ - mutated_sequence[:error.location] + error.alt + mutated_sequence[error.location+len(error.ref):] + self.read_sequence[:error.location] + error.alt + self.read_sequence[error.location+len(error.ref):] # update quality score for error self.update_quality_array( error.ref, @@ -183,19 +183,18 @@ def apply_errors(self, mutated_sequence: MutableSeq, err_model: SequencingErrorM list(err_model.quality_scores), ) - return mutated_sequence + self.read_sequence = Seq(mutated_sequence) - def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mut_model: MutationModel): + def apply_mutations(self, quality_scores: list, mut_model: MutationModel): """ Applying mutations involves one extra step, because of polyploidism. There may be more than one mutation at a given location, so it is formulated as a list. We then pick one at random for this read. 
- :param mutated_sequence: The sequence we are applying the variant to :param quality_scores: The possible quality scores for this run (to update quality scores) :param mut_model: The mutation model for the run, used for the zygosity and for the rng :return: mutated sequence, with mutations applied """ - + mutated_sequence = "" for location in self.mutations: variant_to_apply = mut_model.rng.choice(self.mutations[location]) @@ -218,10 +217,10 @@ def apply_mutations(self, mutated_sequence: MutableSeq, quality_scores: list, mu continue else: self.padding -= variant_to_apply.length - alternate = mutated_sequence[position] + alternate = self.read_sequence[position] else: reference_length = variant_to_apply.get_ref_len() - alternate = mutated_sequence.get_alt() + alternate = self.read_sequence.get_alt() # Replace the entire ref with the entire alt mutated_sequence = \ @@ -244,7 +243,7 @@ def apply_variants_for_final_output( self, error_model: SequencingErrorModel, mutation_model: MutationModel - ) -> Seq: + ): """ Gets mutated sequence to output for fastq/bam @@ -252,27 +251,19 @@ def apply_variants_for_final_output( :param mutation_model: The mutation model for the run :return: the mutated sequence """ - - rng = mutation_model.rng - # The read sequence should have some padding still at this point - read_sequence = MutableSeq(self.read_sequence) - # Deal with N's - for i in range(len(read_sequence)): - if read_sequence[i] == 'N': - # for now replace 'N' masked regions with pure noise. We may refine this in time - read_sequence[i] = rng.choice(['A', 'C', 'G', 'T']) - self.quality_array[i] = min(error_model.quality_scores) - + # I realize that it's a little weird to convert self.read_sequence to mutable then assign to a variable + # instead of operating on it in place, but while it is mutable, it was easier to treat it like a variable then + # reconvert and reassign. if self.mutations: - read_sequence = self.apply_mutations( - read_sequence, list(error_model.quality_scores), mutation_model - ) + self.apply_mutations(list(error_model.quality_scores), mutation_model) if self.errors: - read_sequence = self.apply_errors(read_sequence, error_model) + self.apply_errors(error_model) + # The segments we hold off truncating because we need them for alignments, + # but the quality array is fine to trim here. self.quality_array = self.quality_array[:self.length] - - return Seq(read_sequence) + # Convert back to immutable + self.read_sequence = Seq(self.read_sequence) def contains(self, test_pos: int): return self.position <= test_pos < self.end_point @@ -325,24 +316,21 @@ def finalize_read_and_write( """ # Generate quality scores to cover the extended segment. 
We'll trim later - raw_quality_array = err_model.get_quality_scores(len(self.reference_segment)) + self.quality_array = err_model.get_quality_scores(len(self.reference_segment)) - if self.is_reverse: - read_quality_array = raw_quality_array[::-1] - read_sequence = self.reference_segment.reverse_complement() - else: - read_quality_array = raw_quality_array - read_sequence = self.reference_segment + # This replaces either hard or soft-masked reference segment with upper case or a standard repeat + # It updates the quality array and reference segment in place, including reversing them, if appropriate + self.convert_masking(err_model) # Get errors for the rea and update the quality score - self.errors, self.padding = err_model.get_sequencing_errors(self.length, self.padding, read_sequence, - read_quality_array) + self.errors, self.padding = err_model.get_sequencing_errors(self.length, self.padding, self.reference_segment, + self.quality_array) - # Update the read as needed: - self.quality_array = read_quality_array - self.read_sequence = read_sequence + # Update the read sequence to match the reference + self.read_sequence = self.reference_segment - self.read_sequence = self.apply_variants_for_final_output(err_model, mut_model) + # This applies any variants, updates quality score and read sequence in place + self.apply_variants_for_final_output(err_model, mut_model) self.read_quality_string = "".join([chr(x + self.quality_offset) for x in self.quality_array]) self.mapping_quality = abs(err_model.rng.poisson(70)) @@ -353,6 +341,41 @@ def finalize_read_and_write( fastq_handle.write('+\n') fastq_handle.write(f'{self.read_quality_string}\n') + def convert_masking(self, error_model: SequencingErrorModel): + """ + Replaces invalid characters with random valid characters drawn from a standard repeat sequence seen in a lot + of different species (TTAGGG). If there is call for it, we can make this customizable to species. + + :param error_model: The error model for this run + :return: The modified sequence object + """ + bad_score = min(error_model.quality_scores) + # we'll use generic human repeats, as commonly found in masked regions. We may refine this to make configurable + repeat_bases = list("TTAGGG") + modified_segment = "" + modified_quality_array = self.quality_array.copy() + if self.is_reverse: + raw_segment = self.reference_segment.reverse_complement() + modified_quality_array = modified_quality_array[::-1] + else: + raw_segment = self.reference_segment + + for i in range(len(raw_segment)): + base = raw_segment[i] + if base.islower(): + modified_base = base.upper() + modified_quality_array[i] = bad_score + else: + modified_base = base + if modified_base in ALLOWED_NUCL: + modified_segment += modified_base + else: + modified_segment += repeat_bases[i % 6] + modified_quality_array[i] = bad_score + + self.reference_segment = Seq(modified_segment) + self.quality_array = modified_quality_array + def make_cigar(self): """ Aligns the reference and mutated sequences. From 0e11d20757032c7ed18e20e92100b81738282887 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Fri, 24 May 2024 21:12:54 -0500 Subject: [PATCH 7/8] Okay, I found a few more bugs related to N regionsn. Not bugs, exactly, but edge cases dealing with masked bases in a fasta file. I solved those issues and need to test, but it is looking much more promising than when I started the day. 
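The gist of the masked-base handling, as a rough standalone sketch (the actual Read.convert_masking in the diff below differs in detail, e.g. it tracks the N count and handles reversed reads): anything outside A/C/G/T is swapped for the next base of a generic TTAGGG repeat, and its quality score is dropped to the model's minimum.

    ALLOWED_NUCL = ("A", "C", "G", "T")
    REPEAT_BASES = "TTAGGG"

    def convert_masking_sketch(segment: str, quality: list, bad_score: int):
        out = []
        qual = list(quality)
        for i, base in enumerate(segment.upper()):   # upper() clears soft-masking
            if base in ALLOWED_NUCL:
                out.append(base)
            else:
                out.append(REPEAT_BASES[i % 6])      # fill hard-masked bases with a generic repeat
                qual[i] = bad_score                  # and mark them as low quality
        return "".join(out), qual

    convert_masking_sketch("acGTNN", [30] * 6, 2)    # -> ('ACGTGG', [30, 30, 30, 30, 2, 2])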
--- neat/models/models.py | 12 +- neat/read_simulator/utils/generate_reads.py | 42 +++--- .../utils/output_file_writer.py | 20 ++- neat/read_simulator/utils/read.py | 130 ++++++++++-------- 4 files changed, 112 insertions(+), 92 deletions(-) diff --git a/neat/models/models.py b/neat/models/models.py index 8d61b4db..c66969fa 100644 --- a/neat/models/models.py +++ b/neat/models/models.py @@ -413,7 +413,7 @@ def get_sequencing_errors(self, if self.rng.random() < self.quality_score_error_rate[quality_scores[i]]: error_indexes.append(i) - num_indels_so_far = 0 + total_indel_length = 0 # To prevent deletion collisions del_blacklist = [] @@ -424,22 +424,22 @@ def get_sequencing_errors(self, # Not too sure about how realistic it is to model errors as indels, but I'm leaving the code in for now. # This is to prevent deletion error collisions and to keep there from being too many indel errors. - if 0 < index < self.read_length - max(self.deletion_len_model) and num_indels_so_far > self.read_length//2: + if 0 < index < self.read_length - max(self.deletion_len_model) and total_indel_length > self.read_length//4: error_type = self.rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values())) # Deletion error if error_type == Deletion: deletion_length = self.get_deletion_length() if padding - deletion_length < 0: - # No more space in this read to add deletions - padding = 0 + # No space in this read to add this deletion continue deletion_reference = reference_segment.seq[index: index + deletion_length + 1] deletion_alternate = deletion_reference[0] introduced_errors.append( ErrorContainer(Deletion, index, deletion_length, deletion_reference, deletion_alternate) ) - num_indels_so_far += deletion_length + total_indel_length += deletion_length + del_blacklist.extend(list(range(index, index + deletion_length))) padding -= deletion_length @@ -451,7 +451,7 @@ def get_sequencing_errors(self, introduced_errors.append( ErrorContainer(Insertion, index, insertion_length, insertion_reference, insertion_alternate) ) - num_indels_so_far += insertion_length + total_indel_length += insertion_length # Insert substitution error # Programmer note: if you add new error types, they can be added as elifs above, leaving the final diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 6f5165cd..95f45b77 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -340,18 +340,17 @@ def generate_reads(reference: SeqRecord, read_name = f'{base_name}_{str(i+1)}' - # Initialize variable to track padding - padding = 0 # If the other read is marked as a singleton, then this one was filtered out, or these are single-ended if not read2_is_singleton: # It's properly paired if it's not a singleton is_paired = not read1_is_singleton - # add a small amount of padding to the end to account for deletions - # 30 is basically arbitrary. This may be a function of read length or determinable? - # Though we don't figure out the variants until later - segment = reference[read1[0]: read1[1] + 30].seq + # add a small amount of padding to the end to account for deletions. + # Trying out this method of using the read-length, which for the default neat run gives ~30. 
+ padding = options.read_len//5 + segment = reference[read1[0]: read1[1] + padding].seq - padding = len(segment) - options.read_len + # if we're at the end of the contig, this may not pick up the full padding + actual_padding = len(segment) - options.read_len read_1 = Read( name=read_name + "/1", @@ -360,8 +359,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read1[0] + ref_start, end_point=read1[1] + ref_start, - padding=padding, - quality_offset=options.quality_offset, + padding=actual_padding, is_paired=is_paired ) @@ -370,23 +368,20 @@ def generate_reads(reference: SeqRecord, handle = fq1_paired else: handle = fq1_single - read_1.finalize_read_and_write(error_model_1, mutation_model, handle, options.produce_fastq) + read_1.finalize_read_and_write( + error_model_1, mutation_model, handle, options.quality_offset, options.produce_fastq + ) # if read1 is a sinleton then these are single-ended reads or this one was filtered out, se we skip if not read1_is_singleton: is_paired = not read2_is_singleton - # Because this segment reads back to front, we need padding at the beginning. - # 30 is arbitrary. This may be a function of read length or determinable? - # Though we don't figure out the variants until later - - # If we had a certain amount of padding from read_1, we'll use that here - if padding: - start_coordinate = max((read2[0] - padding), 0) - else: - start_coordinate = max((read2[0] - 30), 0) + # Padding, as above + padding = options.read_len//5 + start_coordinate = max((read2[0] - padding), 0) # this ensures that we get a segment with NEAT-recognized bases segment = reference[start_coordinate: read2[1]].seq - padding = len(segment) - options.read_len + # See note above + actual_padding = len(segment) - options.read_len read_2 = Read( name=read_name + "/2", @@ -395,8 +390,7 @@ def generate_reads(reference: SeqRecord, reference_id=reference.id, position=read2[0] + ref_start, end_point=read2[1] + ref_start, - padding=padding, - quality_offset=options.quality_offset, + padding=actual_padding, is_reverse=True, is_paired=is_paired ) @@ -406,7 +400,9 @@ def generate_reads(reference: SeqRecord, handle = fq2_paired else: handle = fq2_single - read_2.finalize_read_and_write(error_model_2, mutation_model, handle, options.produce_fastq) + read_2.finalize_read_and_write( + error_model_2, mutation_model, handle, options.quality_offset, options.produce_fastq + ) if properly_paired: properly_paired_reads.append((read_1, read_2)) diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py index 93a87912..365a77ba 100755 --- a/neat/read_simulator/utils/output_file_writer.py +++ b/neat/read_simulator/utils/output_file_writer.py @@ -293,7 +293,6 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int and the values the index of that contig :param read_length: the length of the reads for this run """ - # TODO incorporate new read list (no longer a dictionary) from generate_reads pickle file bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL) bam_out.write("BAM\1") header = "@HD\tVN:1.4\tSO:coordinate\n" @@ -312,16 +311,26 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int bam_out.write(f'{item}\0') bam_out.write(pack(' 1: @@ -150,7 +146,7 @@ def update_quality_array( # Original ref is unaffected, so it's quality score remains the same new_quality_score = [low_score] * (len(alternate) - 1) # If is deletion - elif len(reference) > 1 
and len(alternate) == 1: + elif ref_length > 1 and len(alternate) == 1: new_quality_score = [] # SNP else: @@ -160,7 +156,7 @@ def update_quality_array( self.quality_array = \ np.concatenate((self.quality_array[:location], np.array(new_quality_score), - self.quality_array[location+len(reference):])) + self.quality_array[location+ref_length:])) def apply_errors(self, err_model: SequencingErrorModel): """ @@ -169,14 +165,14 @@ def apply_errors(self, err_model: SequencingErrorModel): :param err_model: The error model for this run, :return: None, The sequence, with errors applied """ - mutated_sequence = "" + mutated_sequence = MutableSeq(self.read_sequence) for error in self.errors: # Replace the entire ref sequence with the entire alt sequence mutated_sequence = \ - self.read_sequence[:error.location] + error.alt + self.read_sequence[error.location+len(error.ref):] + mutated_sequence[:error.location] + error.alt + mutated_sequence[error.location+len(error.ref):] # update quality score for error self.update_quality_array( - error.ref, + len(error.ref), error.alt, error.location, "error", @@ -194,7 +190,7 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): :param mut_model: The mutation model for the run, used for the zygosity and for the rng :return: mutated sequence, with mutations applied """ - mutated_sequence = "" + mutated_sequence = MutableSeq(self.read_sequence) for location in self.mutations: variant_to_apply = mut_model.rng.choice(self.mutations[location]) @@ -220,7 +216,7 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): alternate = self.read_sequence[position] else: reference_length = variant_to_apply.get_ref_len() - alternate = self.read_sequence.get_alt() + alternate = variant_to_apply.get_alt() # Replace the entire ref with the entire alt mutated_sequence = \ @@ -229,7 +225,7 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): mutated_sequence[position + reference_length:] self.update_quality_array( - self.reference_segment[location: location+reference_length], + reference_length, alternate, location, "mutation", @@ -237,7 +233,8 @@ def apply_mutations(self, quality_scores: list, mut_model: MutationModel): qual_score ) - return mutated_sequence + # Update the read sequence with the applied mutations + self.read_sequence = Seq(mutated_sequence) def apply_variants_for_final_output( self, @@ -262,8 +259,6 @@ def apply_variants_for_final_output( # The segments we hold off truncating because we need them for alignments, # but the quality array is fine to trim here. self.quality_array = self.quality_array[:self.length] - # Convert back to immutable - self.read_sequence = Seq(self.read_sequence) def contains(self, test_pos: int): return self.position <= test_pos < self.end_point @@ -303,6 +298,7 @@ def finalize_read_and_write( err_model: SequencingErrorModel, mut_model: MutationModel, fastq_handle: TextIO, + quality_offset: int, produce_fastq: bool, ): """ @@ -311,6 +307,7 @@ def finalize_read_and_write( :param err_model: The error model for the run :param mut_model: The mutation model for the run :param fastq_handle: the path to the fastq model to write the read + :param quality_offset: the quality offset for this run :param produce_fastq: If true, this will write out the temp fastqs. If false, this will only write out the tsams to create the bam files. 
""" @@ -322,18 +319,25 @@ def finalize_read_and_write( # It updates the quality array and reference segment in place, including reversing them, if appropriate self.convert_masking(err_model) - # Get errors for the rea and update the quality score - self.errors, self.padding = err_model.get_sequencing_errors(self.length, self.padding, self.reference_segment, - self.quality_array) - - # Update the read sequence to match the reference + # set the read sequence to match the reference self.read_sequence = self.reference_segment + # Get errors for the rea and update the quality score + self.errors, self.padding = err_model.get_sequencing_errors( + self.length, + self.padding, + self.reference_segment, + self.quality_array + ) + # This applies any variants, updates quality score and read sequence in place self.apply_variants_for_final_output(err_model, mut_model) - self.read_quality_string = "".join([chr(x + self.quality_offset) for x in self.quality_array]) - self.mapping_quality = abs(err_model.rng.poisson(70)) + self.read_quality_string = "".join([chr(x + quality_offset) for x in self.quality_array]) + # If this read isn't low quality, pick a standard mapping quality. + # We could have this be user assigned. + if not self.mapping_quality: + self.mapping_quality = 70 if produce_fastq: fastq_handle.write(f'@{self.name}\n') @@ -352,29 +356,26 @@ def convert_masking(self, error_model: SequencingErrorModel): bad_score = min(error_model.quality_scores) # we'll use generic human repeats, as commonly found in masked regions. We may refine this to make configurable repeat_bases = list("TTAGGG") - modified_segment = "" - modified_quality_array = self.quality_array.copy() if self.is_reverse: - raw_segment = self.reference_segment.reverse_complement() - modified_quality_array = modified_quality_array[::-1] + raw_sequence = self.reference_segment.reverse_complement().upper() + self.quality_array = self.quality_array[::-1] else: - raw_segment = self.reference_segment - - for i in range(len(raw_segment)): - base = raw_segment[i] - if base.islower(): - modified_base = base.upper() - modified_quality_array[i] = bad_score - else: - modified_base = base - if modified_base in ALLOWED_NUCL: - modified_segment += modified_base - else: - modified_segment += repeat_bases[i % 6] - modified_quality_array[i] = bad_score + raw_sequence = self.reference_segment.upper() + + start = raw_sequence.find('N') + if start != -1: + modified_segment = MutableSeq(raw_sequence[:start]) + for i in range(start, len(raw_sequence)): + base = raw_sequence[i] + if base in ALLOWED_NUCL: + modified_segment += base + else: + modified_segment += repeat_bases[i % 6] + self.num_ns += 1 + else: + modified_segment = MutableSeq(raw_sequence) self.reference_segment = Seq(modified_segment) - self.quality_array = modified_quality_array def make_cigar(self): """ @@ -383,18 +384,23 @@ def make_cigar(self): # These parameters were set to minimize breaks in the mutated sequence and find the best # alignment from there. 
-
-        # We're ignoring soft-masking (lower case letters)
-        if self.is_reverse:
-            read_sequence = self.read_sequence.reverse_complement().upper()
-        else:
-            read_sequence = self.read_sequence.upper()
-        reference_segment = self.reference_segment.upper()
         raw_alignment = pairwise2.align.localms(
-            reference_segment, read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1,
+            self.reference_segment, self.read_sequence, match=1, mismatch=-1, open=-0.5, extend=-0.1,
             penalize_extend_when_opening=True
         )
+        is_n_heavy = self.num_ns > 10
+
+        # Recall that read_length//5 was what we used for padding.
+        crap_alignment = False
+        if raw_alignment[0][1].count('-') > self.length//10:
+            # Crap alignment
+            crap_alignment = True
+
+        n_to_crap_correlation = False
+        if is_n_heavy and crap_alignment:
+            n_to_crap_correlation = True
+
         alignment = format_alignment(*raw_alignment[0], full_sequences=True).split()
         aligned_template_seq = alignment[0]
         aligned_mut_seq = alignment[-2]
@@ -403,7 +409,7 @@ def make_cigar(self):
         curr_char = ''
         cig_string = ''
         # Find first match
-        for char in range(len(read_sequence)):
+        for char in range(len(self.read_sequence)):
             if aligned_template_seq[char] == '-':  # insertion
                 if curr_char == 'I':  # more insertions
                     cig_count += 1
@@ -417,7 +423,8 @@ def make_cigar(self):
                 if curr_char == 'D':  # more deletions
                     cig_count += 1
                 else:  # new deletion
-                    cig_string = cig_string + str(cig_count) + curr_char
+                    if cig_count != 0:
+                        cig_string = cig_string + str(cig_count) + curr_char
                     curr_char = 'D'
                     cig_count = 1
             else:  # match
@@ -427,7 +434,7 @@ def make_cigar(self):
                 else:  # new match
                     # If there is anything before this, add it to the string and increment,
                     # else, just increment
-                    if not cig_count == 0:
+                    if cig_count != 0:
                         cig_string = cig_string + str(cig_count) + curr_char
                     curr_char = 'M'
                     cig_count = 1
@@ -436,9 +443,11 @@ def make_cigar(self):
                 break

         if cig_length < self.length:
-            raise ValueError("Problem creating cigar string")
+            # testing a theory
+            pass
+            # raise ValueError("Problem creating cigar string")

         # append the final section as we return
-        return cig_string + str(cig_count) + curr_char
+        return cig_string + str(cig_count) + curr_char, n_to_crap_correlation, crap_alignment

     def get_mpos(self):
         """
@@ -467,3 +476,8 @@ def get_tlen(self):
             return length
         else:
             return 0
+
+    def align(self):
+        """
+        Because of the way we have set up the segments, it should be easy to align these.
+ """ From 5250f56e41046bf003e480db58ff24021264dba6 Mon Sep 17 00:00:00 2001 From: Joshua Allen Date: Sat, 25 May 2024 03:12:14 -0500 Subject: [PATCH 8/8] It's passing my tests and I think it's ready to merge te develop --- neat/common/logging.py | 2 ++ neat/read_simulator/runner.py | 6 ++-- neat/read_simulator/utils/generate_reads.py | 6 ++-- .../utils/output_file_writer.py | 32 +++++++++++-------- 4 files changed, 26 insertions(+), 20 deletions(-) diff --git a/neat/common/logging.py b/neat/common/logging.py index eab3b512..92de3d37 100644 --- a/neat/common/logging.py +++ b/neat/common/logging.py @@ -57,4 +57,6 @@ def setup_logging( logging.basicConfig(**kwargs) + logging.getLogger(__name__).info(f"writing log to: {log_file}") + diff --git a/neat/read_simulator/runner.py b/neat/read_simulator/runner.py index 7b408944..7645f211 100644 --- a/neat/read_simulator/runner.py +++ b/neat/read_simulator/runner.py @@ -9,8 +9,8 @@ from Bio import SeqIO from pathlib import Path -from .utils import Options, parse_input_vcf, parse_beds, fill_out_bed_dict, OutputFileWriter, find_file_breaks, \ - map_chromosome, generate_variants, write_local_file, generate_reads +from .utils import Options, parse_input_vcf, parse_beds, OutputFileWriter, find_file_breaks, \ + generate_variants, write_local_file, generate_reads from ..common import validate_input_path, validate_output_path from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel, GcModel from ..models.default_cancer_mutation_model import * @@ -45,7 +45,7 @@ def initialize_all_models(options: Options): cancer_model = None if options.cancer and options.cancer_model: - cancer_model = pickle.load(gzip.open(options.cancer_model)) + # cancer_model = pickle.load(gzip.open(options.cancer_model)) # Set the rng for the cancer mutation model cancer_model.rng = options.rng elif options.cancer: diff --git a/neat/read_simulator/utils/generate_reads.py b/neat/read_simulator/utils/generate_reads.py index 95f45b77..09236eb7 100644 --- a/neat/read_simulator/utils/generate_reads.py +++ b/neat/read_simulator/utils/generate_reads.py @@ -6,13 +6,11 @@ from math import ceil from pathlib import Path from Bio import SeqRecord -from Bio.Seq import Seq -from numpy.random import Generator from bisect import bisect_left, bisect_right from ...models import SequencingErrorModel, GcModel, FragmentLengthModel, MutationModel from .options import Options -from ...common import open_input, open_output, ALLOWED_NUCL +from ...common import open_output from ...variants import ContigVariants from .read import Read @@ -412,7 +410,7 @@ def generate_reads(reference: SeqRecord, else: singletons.append((None, read_2)) - _LOG.debug(f"Contig fastq(s) written in: {(time.process_time() - t)/60:.2f} m") + _LOG.info(f"Contig fastq(s) written in: {(time.process_time() - t)/60:.2f} m") if options.produce_bam: # this will give us the proper read order of the elements, for the sam. 
diff --git a/neat/read_simulator/utils/output_file_writer.py b/neat/read_simulator/utils/output_file_writer.py
index 365a77ba..b7b9aade 100755
--- a/neat/read_simulator/utils/output_file_writer.py
+++ b/neat/read_simulator/utils/output_file_writer.py
@@ -8,24 +8,19 @@
 
 import gzip
 import re
-import shutil
+import time
 from struct import pack
 
 import logging
-import pysam
-import numpy as np
 import pickle
 
 from Bio import bgzf
 from Bio import SeqIO
-from Bio.Seq import Seq, MutableSeq
-from typing import Iterator, TextIO
 from pathlib import Path
 from numpy.random import Generator
 
 from ...common import validate_output_path, open_output, open_input, ALLOWED_NUCL
 from .read import Read
 from .options import Options
-from .neat_cigar import CigarString
 
 _LOG = logging.getLogger(__name__)
 
@@ -203,6 +198,7 @@ def merge_temp_fastqs(
 
         paired_files = []
         singleton_files = []
+        t = time.process_time()
 
         # First split the files into 2 categories. For paired ended reads, most reads will be paired, though some may
         # have gotten filtered out with input files. We will append any singletons in paired ended mode at the end,
@@ -230,10 +226,10 @@ def merge_temp_fastqs(
             for file_pair in singleton_files:
                 if file_pair[0]:
                     file_index = SeqIO.index(str(file_pair[0]), 'fastq')
-                    contig_name = Path(file_pair[0]).name.strip('_r1_single.fq.bgz')
+                    contig_name = Path(file_pair[0]).name.removesuffix('_r1_single.fq.bgz')
                 elif file_pair[1]:
                     file_index = SeqIO.index(str(file_pair[1]), 'fastq')
-                    contig_name = Path(file_pair[1]).name.strip('_r2_single.fq.bgz')
+                    contig_name = Path(file_pair[1]).name.removesuffix('_r2_single.fq.bgz')
                 else:
                     # So singletons for this contig, so move on
                     continue
@@ -259,10 +255,14 @@ def merge_temp_fastqs(
                 open_output(self.fastq2_fn) as fq2
         ):
             # First we add all properly paired reads
-            for i in range(len(shuffled_paired_keys)):
+            num_reads = len(shuffled_paired_keys)
+            for i in range(num_reads):
+                print(f'{i/num_reads:.2%}', end='\r')
                 current_key = shuffled_paired_keys[i]
                 # reconstruct tho chromosome name
-                chrom_name = current_key[0].removeprefix("NEAT-generated_").split('_')[0]
+                chrom_name_with_rdnm = current_key[0].removeprefix("NEAT-generated_").split('/')[0]
+                suffix = re.findall(r"_\d*$", chrom_name_with_rdnm)[0]
+                chrom_name = chrom_name_with_rdnm.removesuffix(suffix)
                 # 1 here because this is read1
                 read1 = fastq_index_dict[chrom_name][1][current_key[0]]
                 SeqIO.write(read1, fq1, 'fastq')
@@ -275,13 +275,16 @@ def merge_temp_fastqs(
             # Next we add the strays (or all reads, for single-ended)
             for j in range(len(shuffled_singleton_keys)):
                 current_key = shuffled_singleton_keys[j]
-                chrom_name = current_key[0].removeprefix("NEAT_generated_").split('_')[0]
-                read = fastq_index_dict[chrom_name][1][current_key[0]]
+                chrom_name_with_rdnm = current_key.removeprefix("NEAT-generated_").split('/')[0]
+                suffix = re.findall(r"_\d*$", chrom_name_with_rdnm)[0]
+                chrom_name = chrom_name_with_rdnm.removesuffix(suffix)
+                read = fastq_index_dict[chrom_name][1][current_key]
                 SeqIO.write(read, fq1, 'fastq')
 
         if not wrote_r2:
             fastq2_path = Path(self.fastq2_fn)
             fastq2_path.unlink()
+        _LOG.info(f"Fastq(s) written in {(time.process_time() - t)/60:.2f} m")
 
     def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int):
         """
@@ -293,6 +296,7 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int
             and the values the index of that contig
         :param read_length: the length of the reads for this run
         """
+        t = time.process_time()
         bam_out = bgzf.BgzfWriter(self.bam_fn, 'w', compresslevel=BAM_COMPRESSION_LEVEL)
         bam_out.write("BAM\1")
         header = "@HD\tVN:1.4\tSO:coordinate\n"
@@ -330,9 +334,11 @@ def output_bam_file(self, reads_files: list, contig_dict: dict, read_length: int
                 elif is_crap:
                     total_crap += 1
         if total_crap:
-            print(f"n-to-crap correlation: {n_to_crap_correlation/total_crap:.2%}")
+            _LOG.info(f"n-to-crap correlation: {n_to_crap_correlation/total_crap:.2%}")
 
         bam_out.close()
 
+        _LOG.info(f"Bam written in: {(time.process_time() - t)/60:.2f} m")
+
     def write_bam_record(self, read: Read, contig_id: int, bam_handle: bgzf.BgzfWriter, read_length: int):
         """
         Takes a read object and writes it out as a bam record
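
For reference, the core of make_cigar in the read.py patch above is a run-length encoding of the pairwise2 alignment columns: a gap in the template column counts as an insertion (I), a gap in the read column as a deletion (D), and any other column as a match/mismatch (M). The following is a minimal standalone sketch of that encoding only; the function name cigar_from_alignment and its exact signature are illustrative, not part of the patch.

# Illustrative sketch (not part of the patch): run-length encode two aligned
# strings (template on top, read underneath) into a CIGAR string.
def cigar_from_alignment(aligned_template: str, aligned_read: str) -> str:
    assert len(aligned_template) == len(aligned_read)
    cigar = ""
    run_char = ""
    run_len = 0
    for t_base, r_base in zip(aligned_template, aligned_read):
        if t_base == "-":
            op = "I"  # read has a base the template lacks
        elif r_base == "-":
            op = "D"  # template has a base the read lacks
        else:
            op = "M"  # match or mismatch
        if op == run_char:
            run_len += 1
        else:
            if run_len:
                cigar += f"{run_len}{run_char}"
            run_char, run_len = op, 1
    if run_len:
        cigar += f"{run_len}{run_char}"
    return cigar

# e.g. cigar_from_alignment("ACGT-A", "AC-TTA") == "2M1D1M1I1M"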
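Similarly, the quality_offset argument threaded into finalize_read_and_write is the usual Phred-to-ASCII shift used when building the FASTQ quality string: each numeric score is offset and mapped to a character. A small illustrative helper, assuming the common Sanger/Illumina 1.8+ offset of 33 (phred_to_ascii is a made-up name, not NEAT's API):

# Illustrative sketch: convert numeric Phred scores to a FASTQ quality string.
def phred_to_ascii(quality_array, quality_offset=33):
    return "".join(chr(q + quality_offset) for q in quality_array)

# phred_to_ascii([30, 35, 40]) == "?DI"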