Merge pull request #110 from ncsa/feature/simplifying_coverage
Feature/simplifying coverage
joshfactorial committed May 25, 2024
2 parents 10c6022 + 5250f56 commit 1eafd28
Showing 12 changed files with 715 additions and 706 deletions.
2 changes: 1 addition & 1 deletion neat/cli/cli.py
@@ -140,7 +140,7 @@ def main(parser: argparse.ArgumentParser, arguments: list[str]) -> int:
else:
end = time.time()
log.info(
"command finished successfully; execution took %.3f sec", end - start
f"command finished successfully; execution took {(end - start)/60:.2f} m"
)
return 0

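The change above switches the run-time log message from seconds to minutes. A minimal sketch of the before/after formatting (illustrative only; `start`, `end`, and `log` stand in for the objects already present in `main()`):

```python
import logging
import time

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

start = time.time()
time.sleep(0.1)  # stand-in for the simulated work being timed
end = time.time()

# Old message: elapsed seconds via %-style formatting
log.info("command finished successfully; execution took %.3f sec", end - start)

# New message (this commit): elapsed minutes via an f-string
log.info(f"command finished successfully; execution took {(end - start) / 60:.2f} m")
```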
2 changes: 2 additions & 0 deletions neat/common/logging.py
@@ -57,4 +57,6 @@ def setup_logging(

logging.basicConfig(**kwargs)

logging.getLogger(__name__).info(f"writing log to: {log_file}")
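The added line records the log destination as soon as logging is configured. A self-contained sketch of the same pattern; the handler setup and the `log_file` default below are assumptions for illustration, not NEAT's actual `setup_logging` arguments:

```python
import logging
from pathlib import Path


def setup_logging(log_file: Path = Path("neat.log")) -> None:
    # Hypothetical kwargs; NEAT assembles its own handlers and format elsewhere.
    kwargs = {
        "level": logging.INFO,
        "format": "%(asctime)s %(levelname)s %(name)s: %(message)s",
        "handlers": [logging.StreamHandler(), logging.FileHandler(log_file)],
    }
    logging.basicConfig(**kwargs)

    # The line added in this commit: report where the log is being written.
    logging.getLogger(__name__).info(f"writing log to: {log_file}")


setup_logging()
```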


26 changes: 10 additions & 16 deletions neat/common/ploid_functions.py
@@ -28,24 +28,18 @@ def pick_ploids(ploidy: int,
:return: a list of strings representing the genotype of each ploid.
"""
# number of ploids to make this mutation on (always at least 1)
how_many = 1
if rng.random() < homozygous_frequency:
if ploidy <= 2:
# If it's homozygous with a ploidy of 2 it's on both.
how_many = ploidy
else:
# if it's polyploid, it's on more than one ploid
# TODO may need to improve the modeling for polyploid
how_many = 1
for i in range(ploidy):
# Not totally sure how to model this, so I'm counting each
# ploid as a separate homozygous event. That doesn't exactly make
# sense though, so we'll improve this later.
if rng.random() < homozygous_frequency:
how_many += 1
else:
break
# We'll consider this one to be homozygous
how_many = ploidy
else:
how_many = 1
if ploidy == 1:
# special case where heterozygous makes no sense
how_many = 1
else:
# if it's polyploid, we'll consider it to be on roughly half the ploids
# TODO may need to improve the modeling for polyploid, maybe
how_many = ploidy//2

# wp is just the temporary genotype list, a hat tip to the old version of NEAT
wp = np.zeros(ploidy)
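The rewritten branch above collapses the old per-ploid sampling loop into three cases: a homozygous draw covers every ploid, a haploid genome always gets one copy, and other polyploids get roughly half their ploids. A standalone sketch of that logic; how the chosen ploids are placed into the genotype array is an assumption here, since the rest of `pick_ploids` is not shown in this diff:

```python
import numpy as np


def pick_ploids(ploidy: int, homozygous_frequency: float, rng: np.random.Generator) -> list[int]:
    """Simplified sketch of the genotype selection introduced in this commit."""
    if rng.random() < homozygous_frequency:
        # Homozygous: the variant is placed on every ploid.
        how_many = ploidy
    elif ploidy == 1:
        # Haploid: heterozygous has no meaning, so always one copy.
        how_many = 1
    else:
        # Polyploid heterozygous: roughly half the ploids carry the variant.
        how_many = ploidy // 2

    # wp is the temporary genotype list; which ploids get the variant is
    # illustrative only, the real function decides placement itself.
    wp = np.zeros(ploidy, dtype=int)
    wp[rng.choice(ploidy, size=how_many, replace=False)] = 1
    return wp.tolist()


# Example: pick_ploids(4, 0.3, np.random.default_rng(0)) might return [0, 1, 1, 0]
```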
8 changes: 0 additions & 8 deletions neat/models/default_fraglen_model.py

This file was deleted.

61 changes: 20 additions & 41 deletions neat/models/models.py
@@ -21,7 +21,6 @@
from .default_mutation_model import *
from .default_sequencing_error_model import *
from .default_gc_bias_model import *
from .default_fraglen_model import *
from .utils import bin_scores, take_closest

__all__ = [
@@ -389,16 +388,17 @@ def __init__(self,

def get_sequencing_errors(self,
read_length: int,
padding: int,
reference_segment: SeqRecord,
quality_scores: np.ndarray):
"""
Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score
based on the container model.
:param read_length: The length of the read to generate errors for.
:param padding: this is the amount of space we have in the read for deletions.
:param reference_segment: The section of the reference from which the read is drawn
:param quality_scores: Array of quality scores for the read
:return: modified sequence and associated quality scores
:return: Modified sequence and associated quality scores
"""

error_indexes = []
@@ -413,7 +413,7 @@ def get_sequencing_errors(self,
if self.rng.random() < self.quality_score_error_rate[quality_scores[i]]:
error_indexes.append(i)

num_indels_so_far = 0
total_indel_length = 0
# To prevent deletion collisions
del_blacklist = []

@@ -424,19 +424,24 @@
# Not too sure about how realistic it is to model errors as indels, but I'm leaving the code in for now.

# This is to prevent deletion error collisions and to keep there from being too many indel errors.
if 0 < index < self.read_length - max(self.deletion_len_model) and num_indels_so_far > self.read_length//2:
if 0 < index < self.read_length - max(self.deletion_len_model) and total_indel_length > self.read_length//4:
error_type = self.rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values()))

# Deletion error
if error_type == Deletion:
deletion_length = self.get_deletion_length()
if padding - deletion_length < 0:
# No space in this read to add this deletion
continue
deletion_reference = reference_segment.seq[index: index + deletion_length + 1]
deletion_alternate = deletion_reference[0]
introduced_errors.append(
ErrorContainer(Deletion, index, deletion_length, deletion_reference, deletion_alternate)
)
num_indels_so_far += deletion_length
total_indel_length += deletion_length

del_blacklist.extend(list(range(index, index + deletion_length)))
padding -= deletion_length

elif error_type == Insertion:
insertion_length = self.get_insertion_length()
@@ -446,7 +451,7 @@ def get_sequencing_errors(self,
introduced_errors.append(
ErrorContainer(Insertion, index, insertion_length, insertion_reference, insertion_alternate)
)
num_indels_so_far += insertion_length
total_indel_length += insertion_length

# Insert substitution error
# Programmer note: if you add new error types, they can be added as elifs above, leaving the final
@@ -465,7 +470,7 @@ def get_sequencing_errors(self,
if introduced_errors[i].location in del_blacklist:
del introduced_errors[i]

return introduced_errors
return introduced_errors, max(padding, 0)

def quality_index_remap(self, input_read_length):
"""
@@ -600,53 +605,27 @@ class FragmentLengthModel:
:param fragment_mean: the mean of the collection of fragment lengths derived from data
:param fragment_std: the standard deviation of the collection of fragment lengths derived from data
:param fragment_max: the largest fragment observed in the data
:param fragment_min: the smallest fragment observed in data
:param rng: the random number generator for the run
"""

def __init__(self,
fragment_mean: float = None,
fragment_std: float = None,
fragment_max: int = None,
fragment_min: int = None,
fragment_mean: float,
fragment_std: float,
rng: Generator = None):
self.fragment_mean = fragment_mean if fragment_mean else default_fragment_mean
self.fragment_st_dev = fragment_std if fragment_std else default_fragment_std
self.fragment_max = fragment_max if fragment_max else default_fragment_max
self.fragment_min = fragment_min if fragment_min else default_fragment_min
self.fragment_mean = fragment_mean
self.fragment_st_dev = fragment_std
self.rng = rng

def generate_fragments(self,
total_length: int,
read_length: int,
coverage: int) -> list:
number_of_fragments: int) -> list:
"""
Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set
:param total_length: Length of the reference segment we are covering.
:param read_length: average length of the reads
:param coverage: the target coverage number
:param number_of_fragments: The number of fragments needed.
:return: A list of random fragment lengths sampled from the model.
"""
# Estimate the number of fragments needed (with a 2x padding)
number_of_fragments = int(round(total_length / read_length) * (coverage * 2))
# Check that we don't have unusable values for fragment mean. Too many fragments under the read length means
# NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or
# it will only find a few and will run very slowly.
if self.fragment_mean < read_length:
# Let's just reset the fragment mean to make up for this.
self.fragment_mean = read_length
# generates a distribution, assuming normality, then rounds the result and converts to ints
dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
# filter the list to throw out outliers and to set anything under the read length to the read length.
dist = [max(x, read_length) for x in dist if x <= self.fragment_max]
# Just a sanity check to make sure our data isn't too thin:
while number_of_fragments - len(dist) > 0:
additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev)
if additional_fragment < read_length:
continue
dist.append(round(additional_fragment))

# Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length
return dist
return [abs(x) for x in dist]
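With the defaults, min/max bounds, and coverage math removed, the model now only needs a mean and standard deviation and draws however many fragment lengths the caller asks for. A trimmed-down sketch of the simplified class as it appears after this commit (rng handling and example values are illustrative):

```python
import numpy as np
from numpy.random import Generator


class FragmentLengthModel:
    """Sketch of the simplified fragment length model."""

    def __init__(self, fragment_mean: float, fragment_std: float, rng: Generator = None):
        self.fragment_mean = fragment_mean
        self.fragment_st_dev = fragment_std
        self.rng = rng if rng is not None else np.random.default_rng()

    def generate_fragments(self, number_of_fragments: int) -> list:
        # Sample a normal distribution, round to integer lengths, and take
        # absolute values so no negative lengths slip through.
        dist = np.round(
            self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)
        ).astype(int)
        return [abs(x) for x in dist]


# Example (illustrative numbers): FragmentLengthModel(350.0, 30.0).generate_fragments(10)
```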
34 changes: 19 additions & 15 deletions neat/read_simulator/runner.py
@@ -9,8 +9,8 @@
from Bio import SeqIO
from pathlib import Path

from .utils import Options, parse_input_vcf, parse_beds, fill_out_bed_dict, OutputFileWriter, find_file_breaks, \
map_chromosome, generate_variants, write_local_file, generate_reads
from .utils import Options, parse_input_vcf, parse_beds, OutputFileWriter, find_file_breaks, \
generate_variants, write_local_file, generate_reads
from ..common import validate_input_path, validate_output_path
from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel, GcModel
from ..models.default_cancer_mutation_model import *
@@ -45,7 +45,7 @@ def initialize_all_models(options: Options):

cancer_model = None
if options.cancer and options.cancer_model:
cancer_model = pickle.load(gzip.open(options.cancer_model))
# cancer_model = pickle.load(gzip.open(options.cancer_model))
# Set the rng for the cancer mutation model
cancer_model.rng = options.rng
elif options.cancer:
@@ -98,15 +98,16 @@ def initialize_all_models(options: Options):

_LOG.debug('GC Bias model loaded')

fraglen_model = None
if options.paired_ended:
if options.fragment_model:
fraglen_model = pickle.load(gzip.open(options.fragment_model))
else:
fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev)

# Set the rng for the fragment length model
if options.fragment_model:
fraglen_model = pickle.load(gzip.open(options.fragment_model))
fraglen_model.rng = options.rng
elif options.fragment_mean:
fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng)
else:
# For single ended, fragment length will be based on read length
fragment_mean = options.read_len * 1.5
fragment_st_dev = fragment_mean * 0.2
fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng)

_LOG.debug("Fragment length model loaded")

@@ -174,6 +175,7 @@ def read_simulator_runner(config: str, output: str):
"""
_LOG.info(f'Reading {options.reference}.')

# TODO check into SeqIO.index_db()
reference_index = SeqIO.index(str(options.reference), "fasta")
_LOG.debug("Reference file indexed.")

@@ -300,7 +302,7 @@ def read_simulator_runner(config: str, output: str):
pass

if options.paired_ended:
max_qual_score = max(seq_error_model_1.quality_scores + seq_error_model_2.quality_scores)
max_qual_score = max(max(seq_error_model_1.quality_scores), max(seq_error_model_2.quality_scores))
else:
max_qual_score = max(seq_error_model_1.quality_scores)

@@ -329,11 +331,12 @@ def read_simulator_runner(config: str, output: str):
fasta_files.extend(local_fasta_file)

if options.produce_fastq or options.produce_bam:
read1_fastq, read2_fastq = \
read1_fastq_paired, read1_fastq_single, read2_fastq_paired, read2_fastq_single = \
generate_reads(local_reference,
local_bam_pickle_file,
seq_error_model_1,
seq_error_model_2,
mut_model,
gc_bias_model,
fraglen_model,
local_variants,
@@ -343,7 +346,8 @@ def read_simulator_runner(config: str, output: str):
options,
contig)

fastq_files.append((read1_fastq, read2_fastq))
contig_temp_fastqs = ((read1_fastq_paired, read2_fastq_paired), (read1_fastq_single, read2_fastq_single))
fastq_files.append(contig_temp_fastqs)
if options.produce_bam:
sam_reads_files.append(local_bam_pickle_file)

@@ -367,4 +371,4 @@ def read_simulator_runner(config: str, output: str):
_LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
contig_list = list(reference_index)
contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
output_file_writer.output_bam_file(sam_reads_files, contigs_by_index)
output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)
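The runner now picks the fragment length model in three tiers: a user-supplied pickled model, a config-supplied mean and standard deviation, or a fallback derived from the read length. A hypothetical standalone sketch of that selection; the `options` object here is a stand-in with the same attribute names the diff uses, and `choose_fraglen_model` is an invented name, assuming `FragmentLengthModel` is importable from `neat.models`:

```python
import gzip
import pickle
from types import SimpleNamespace

import numpy as np

from neat.models import FragmentLengthModel  # assumes NEAT is installed


def choose_fraglen_model(options):
    """Sketch of the three-way model selection introduced in this commit."""
    if options.fragment_model:
        # Tier 1: a pickled, gzipped model supplied by the user; reuse the run's rng.
        fraglen_model = pickle.load(gzip.open(options.fragment_model))
        fraglen_model.rng = options.rng
    elif options.fragment_mean:
        # Tier 2: mean and standard deviation supplied in the config.
        fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng)
    else:
        # Tier 3: derive a model from the read length (e.g. single-ended runs).
        fragment_mean = options.read_len * 1.5
        fragment_st_dev = fragment_mean * 0.2
        fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng)
    return fraglen_model


# Hypothetical usage:
opts = SimpleNamespace(fragment_model=None, fragment_mean=None, fragment_st_dev=None,
                       read_len=150, rng=np.random.default_rng())
model = choose_fraglen_model(opts)
```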