Merge pull request #110 from ncsa/feature/simplifying_coverage
Feature/simplifying coverage
joshfactorial committed May 25, 2024
2 parents 10c6022 + 5250f56 commit 1eafd28
Showing 12 changed files with 715 additions and 706 deletions.
2 changes: 1 addition & 1 deletion neat/cli/cli.py
@@ -140,7 +140,7 @@ def main(parser: argparse.ArgumentParser, arguments: list[str]) -> int:
else:
end = time.time()
log.info(
"command finished successfully; execution took %.3f sec", end - start
f"command finished successfully; execution took {(end - start)/60:.2f} m"
)
return 0

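The change above switches the run-time log message from seconds to minutes. A minimal sketch of the before/after formatting (illustrative only; `start`, `end`, and `log` stand in for the objects already present in `main()`):

```python
import logging
import time

logging.basicConfig(level=logging.INFO)
log = logging.getLogger(__name__)

start = time.time()
time.sleep(0.1)  # stand-in for the simulated work being timed
end = time.time()

# Old message: elapsed seconds via %-style formatting
log.info("command finished successfully; execution took %.3f sec", end - start)

# New message (this commit): elapsed minutes via an f-string
log.info(f"command finished successfully; execution took {(end - start) / 60:.2f} m")
```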
2 changes: 2 additions & 0 deletions neat/common/logging.py
@@ -57,4 +57,6 @@ def setup_logging(

logging.basicConfig(**kwargs)

logging.getLogger(__name__).info(f"writing log to: {log_file}")
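The added line records the log destination as soon as logging is configured. A self-contained sketch of the same pattern; the handler setup and the `log_file` default below are assumptions for illustration, not NEAT's actual `setup_logging` arguments:

```python
import logging
from pathlib import Path


def setup_logging(log_file: Path = Path("neat.log")) -> None:
    # Hypothetical kwargs; NEAT assembles its own handlers and format elsewhere.
    kwargs = {
        "level": logging.INFO,
        "format": "%(asctime)s %(levelname)s %(name)s: %(message)s",
        "handlers": [logging.StreamHandler(), logging.FileHandler(log_file)],
    }
    logging.basicConfig(**kwargs)

    # The line added in this commit: report where the log is being written.
    logging.getLogger(__name__).info(f"writing log to: {log_file}")


setup_logging()
```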


26 changes: 10 additions & 16 deletions neat/common/ploid_functions.py
@@ -28,24 +28,18 @@ def pick_ploids(ploidy: int,
:return: a list of strings representing the genotype of each ploid.
"""
# number of ploids to make this mutation on (always at least 1)
how_many = 1
if rng.random() < homozygous_frequency:
if ploidy <= 2:
# If it's homozygous with a ploidy of 2 it's on both.
how_many = ploidy
else:
# if it's polyploid, it's on more than one ploid
# TODO may need to improve the modeling for polyploid
how_many = 1
for i in range(ploidy):
# Not totally sure how to model this, so I'm counting each
# ploid as a separate homozygous event. That doesn't exactly make
# sense though, so we'll improve this later.
if rng.random() < homozygous_frequency:
how_many += 1
else:
break
# We'll consider this one to be homozygous
how_many = ploidy
else:
how_many = 1
if ploidy == 1:
# special case where heterozygous makes no sense
how_many = 1
else:
# if it's polyploid, we'll consider it to be on roughly half the ploids
# TODO may need to improve the modeling for polyploid, maybe
how_many = ploidy//2

# wp is just the temporary genotype list, a hat tip to the old version of NEAT
wp = np.zeros(ploidy)
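The rewritten branch above collapses the old per-ploid sampling loop into three cases: a homozygous draw covers every ploid, a haploid genome always gets one copy, and other polyploids get roughly half their ploids. A standalone sketch of that logic; how the chosen ploids are placed into the genotype array is an assumption here, since the rest of `pick_ploids` is not shown in this diff:

```python
import numpy as np


def pick_ploids(ploidy: int, homozygous_frequency: float, rng: np.random.Generator) -> list[int]:
    """Simplified sketch of the genotype selection introduced in this commit."""
    if rng.random() < homozygous_frequency:
        # Homozygous: the variant is placed on every ploid.
        how_many = ploidy
    elif ploidy == 1:
        # Haploid: heterozygous has no meaning, so always one copy.
        how_many = 1
    else:
        # Polyploid heterozygous: roughly half the ploids carry the variant.
        how_many = ploidy // 2

    # wp is the temporary genotype list; which ploids get the variant is
    # illustrative only, the real function decides placement itself.
    wp = np.zeros(ploidy, dtype=int)
    wp[rng.choice(ploidy, size=how_many, replace=False)] = 1
    return wp.tolist()


# Example: pick_ploids(4, 0.3, np.random.default_rng(0)) might return [0, 1, 1, 0]
```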
8 changes: 0 additions & 8 deletions neat/models/default_fraglen_model.py

This file was deleted.

61 changes: 20 additions & 41 deletions neat/models/models.py
@@ -21,7 +21,6 @@
from .default_mutation_model import *
from .default_sequencing_error_model import *
from .default_gc_bias_model import *
from .default_fraglen_model import *
from .utils import bin_scores, take_closest

__all__ = [
@@ -389,16 +388,17 @@ def __init__(self,

def get_sequencing_errors(self,
read_length: int,
padding: int,
reference_segment: SeqRecord,
quality_scores: np.ndarray):
"""
Inserts errors of type substitution, insertion, or deletion into read_data, and assigns a quality score
based on the container model.
:param read_length: The length of the read to generate errors for.
:param padding: this is the amount of space we have in the read for deletions.
:param reference_segment: The section of the reference from which the read is drawn
:param quality_scores: Array of quality scores for the read
:return: modified sequence and associated quality scores
:return: Modified sequence and associated quality scores
"""

error_indexes = []
@@ -413,7 +413,7 @@ def get_sequencing_errors(self,
if self.rng.random() < self.quality_score_error_rate[quality_scores[i]]:
error_indexes.append(i)

num_indels_so_far = 0
total_indel_length = 0
# To prevent deletion collisions
del_blacklist = []

@@ -424,19 +424,24 @@
# Not too sure about how realistic it is to model errors as indels, but I'm leaving the code in for now.

# This is to prevent deletion error collisions and to keep there from being too many indel errors.
if 0 < index < self.read_length - max(self.deletion_len_model) and num_indels_so_far > self.read_length//2:
if 0 < index < self.read_length - max(self.deletion_len_model) and total_indel_length > self.read_length//4:
error_type = self.rng.choice(a=list(self.variant_probs), p=list(self.variant_probs.values()))

# Deletion error
if error_type == Deletion:
deletion_length = self.get_deletion_length()
if padding - deletion_length < 0:
# No space in this read to add this deletion
continue
deletion_reference = reference_segment.seq[index: index + deletion_length + 1]
deletion_alternate = deletion_reference[0]
introduced_errors.append(
ErrorContainer(Deletion, index, deletion_length, deletion_reference, deletion_alternate)
)
num_indels_so_far += deletion_length
total_indel_length += deletion_length

del_blacklist.extend(list(range(index, index + deletion_length)))
padding -= deletion_length

elif error_type == Insertion:
insertion_length = self.get_insertion_length()
@@ -446,7 +451,7 @@ def get_sequencing_errors(self,
introduced_errors.append(
ErrorContainer(Insertion, index, insertion_length, insertion_reference, insertion_alternate)
)
num_indels_so_far += insertion_length
total_indel_length += insertion_length

# Insert substitution error
# Programmer note: if you add new error types, they can be added as elifs above, leaving the final
@@ -465,7 +470,7 @@ def get_sequencing_errors(self,
if introduced_errors[i].location in del_blacklist:
del introduced_errors[i]

return introduced_errors
return introduced_errors, max(padding, 0)

def quality_index_remap(self, input_read_length):
"""
@@ -600,53 +605,27 @@ class FragmentLengthModel:
:param fragment_mean: the mean of the collection of fragment lengths derived from data
:param fragment_std: the standard deviation of the collection of fragment lengths derived from data
:param fragment_max: the largest fragment observed in the data
:param fragment_min: the smallest fragment observed in data
:param rng: the random number generator for the run
"""

def __init__(self,
fragment_mean: float = None,
fragment_std: float = None,
fragment_max: int = None,
fragment_min: int = None,
fragment_mean: float,
fragment_std: float,
rng: Generator = None):
self.fragment_mean = fragment_mean if fragment_mean else default_fragment_mean
self.fragment_st_dev = fragment_std if fragment_std else default_fragment_std
self.fragment_max = fragment_max if fragment_max else default_fragment_max
self.fragment_min = fragment_min if fragment_min else default_fragment_min
self.fragment_mean = fragment_mean
self.fragment_st_dev = fragment_std
self.rng = rng

def generate_fragments(self,
total_length: int,
read_length: int,
coverage: int) -> list:
number_of_fragments: int) -> list:
"""
Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set
:param total_length: Length of the reference segment we are covering.
:param read_length: average length of the reads
:param coverage: the target coverage number
:param number_of_fragments: The number of fragments needed.
:return: A list of random fragment lengths sampled from the model.
"""
# Estimate the number of fragments needed (with a 2x padding)
number_of_fragments = int(round(total_length / read_length) * (coverage * 2))
# Check that we don't have unusable values for fragment mean. Too many fragments under the read length means
# NEAT will either get caught in an infinite cycle of sampling fragments but never finding one that works, or
# it will only find a few and will run very slowly.
if self.fragment_mean < read_length:
# Let's just reset the fragment mean to make up for this.
self.fragment_mean = read_length
# generates a distribution, assuming normality, then rounds the result and converts to ints
dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
# filter the list to throw out outliers and to set anything under the read length to the read length.
dist = [max(x, read_length) for x in dist if x <= self.fragment_max]
# Just a sanity check to make sure our data isn't too thin:
while number_of_fragments - len(dist) > 0:
additional_fragment = self.rng.normal(loc=self.fragment_mean, scale=self.fragment_st_dev)
if additional_fragment < read_length:
continue
dist.append(round(additional_fragment))

# Now set a minimum on the dataset. Any fragment smaller than read_length gets turned into read_length
return dist
return [abs(x) for x in dist]
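With the defaults, min/max bounds, and coverage math removed, the model now only needs a mean and standard deviation and draws however many fragment lengths the caller asks for. A trimmed-down sketch of the simplified class as it appears after this commit (rng handling and example values are illustrative):

```python
import numpy as np
from numpy.random import Generator


class FragmentLengthModel:
    """Sketch of the simplified fragment length model."""

    def __init__(self, fragment_mean: float, fragment_std: float, rng: Generator = None):
        self.fragment_mean = fragment_mean
        self.fragment_st_dev = fragment_std
        self.rng = rng if rng is not None else np.random.default_rng()

    def generate_fragments(self, number_of_fragments: int) -> list:
        # Sample a normal distribution, round to integer lengths, and take
        # absolute values so no negative lengths slip through.
        dist = np.round(
            self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)
        ).astype(int)
        return [abs(x) for x in dist]


# Example (illustrative numbers): FragmentLengthModel(350.0, 30.0).generate_fragments(10)
```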
34 changes: 19 additions & 15 deletions neat/read_simulator/runner.py
@@ -9,8 +9,8 @@
from Bio import SeqIO
from pathlib import Path

from .utils import Options, parse_input_vcf, parse_beds, fill_out_bed_dict, OutputFileWriter, find_file_breaks, \
map_chromosome, generate_variants, write_local_file, generate_reads
from .utils import Options, parse_input_vcf, parse_beds, OutputFileWriter, find_file_breaks, \
generate_variants, write_local_file, generate_reads
from ..common import validate_input_path, validate_output_path
from ..models import MutationModel, SequencingErrorModel, FragmentLengthModel, GcModel
from ..models.default_cancer_mutation_model import *
@@ -45,7 +45,7 @@ def initialize_all_models(options: Options):

cancer_model = None
if options.cancer and options.cancer_model:
cancer_model = pickle.load(gzip.open(options.cancer_model))
# cancer_model = pickle.load(gzip.open(options.cancer_model))
# Set the rng for the cancer mutation model
cancer_model.rng = options.rng
elif options.cancer:
@@ -98,15 +98,16 @@ def initialize_all_models(options: Options):

_LOG.debug('GC Bias model loaded')

fraglen_model = None
if options.paired_ended:
if options.fragment_model:
fraglen_model = pickle.load(gzip.open(options.fragment_model))
else:
fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev)

# Set the rng for the fragment length model
if options.fragment_model:
fraglen_model = pickle.load(gzip.open(options.fragment_model))
fraglen_model.rng = options.rng
elif options.fragment_mean:
fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng)
else:
# For single ended, fragment length will be based on read length
fragment_mean = options.read_len * 1.5
fragment_st_dev = fragment_mean * 0.2
fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng)

_LOG.debug("Fragment length model loaded")

@@ -174,6 +175,7 @@ def read_simulator_runner(config: str, output: str):
"""
_LOG.info(f'Reading {options.reference}.')

# TODO check into SeqIO.index_db()
reference_index = SeqIO.index(str(options.reference), "fasta")
_LOG.debug("Reference file indexed.")

@@ -300,7 +302,7 @@ def read_simulator_runner(config: str, output: str):
pass

if options.paired_ended:
max_qual_score = max(seq_error_model_1.quality_scores + seq_error_model_2.quality_scores)
max_qual_score = max(max(seq_error_model_1.quality_scores), max(seq_error_model_2.quality_scores))
else:
max_qual_score = max(seq_error_model_1.quality_scores)

@@ -329,11 +331,12 @@ def read_simulator_runner(config: str, output: str):
fasta_files.extend(local_fasta_file)

if options.produce_fastq or options.produce_bam:
read1_fastq, read2_fastq = \
read1_fastq_paired, read1_fastq_single, read2_fastq_paired, read2_fastq_single = \
generate_reads(local_reference,
local_bam_pickle_file,
seq_error_model_1,
seq_error_model_2,
mut_model,
gc_bias_model,
fraglen_model,
local_variants,
@@ -343,7 +346,8 @@ def read_simulator_runner(config: str, output: str):
options,
contig)

fastq_files.append((read1_fastq, read2_fastq))
contig_temp_fastqs = ((read1_fastq_paired, read2_fastq_paired), (read1_fastq_single, read2_fastq_single))
fastq_files.append(contig_temp_fastqs)
if options.produce_bam:
sam_reads_files.append(local_bam_pickle_file)

@@ -367,4 +371,4 @@ def read_simulator_runner(config: str, output: str):
_LOG.info(f"Outputting golden bam file: {str(output_file_writer.bam_fn)}")
contig_list = list(reference_index)
contigs_by_index = {contig_list[n]: n for n in range(len(contig_list))}
output_file_writer.output_bam_file(sam_reads_files, contigs_by_index)
output_file_writer.output_bam_file(sam_reads_files, contigs_by_index, options.read_len)
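The runner now picks the fragment length model in three tiers: a user-supplied pickled model, a config-supplied mean and standard deviation, or a fallback derived from the read length. A hypothetical standalone sketch of that selection; the `options` object here is a stand-in with the same attribute names the diff uses, and `choose_fraglen_model` is an invented name, assuming `FragmentLengthModel` is importable from `neat.models`:

```python
import gzip
import pickle
from types import SimpleNamespace

import numpy as np

from neat.models import FragmentLengthModel  # assumes NEAT is installed


def choose_fraglen_model(options):
    """Sketch of the three-way model selection introduced in this commit."""
    if options.fragment_model:
        # Tier 1: a pickled, gzipped model supplied by the user; reuse the run's rng.
        fraglen_model = pickle.load(gzip.open(options.fragment_model))
        fraglen_model.rng = options.rng
    elif options.fragment_mean:
        # Tier 2: mean and standard deviation supplied in the config.
        fraglen_model = FragmentLengthModel(options.fragment_mean, options.fragment_st_dev, rng=options.rng)
    else:
        # Tier 3: derive a model from the read length (e.g. single-ended runs).
        fragment_mean = options.read_len * 1.5
        fragment_st_dev = fragment_mean * 0.2
        fraglen_model = FragmentLengthModel(fragment_mean, fragment_st_dev, options.rng)
    return fraglen_model


# Hypothetical usage:
opts = SimpleNamespace(fragment_model=None, fragment_mean=None, fragment_st_dev=None,
                       read_len=150, rng=np.random.default_rng())
model = choose_fraglen_model(opts)
```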