Skip to content

Commit

Permalink
Merge pull request #115 from ncsa/develop
Browse files Browse the repository at this point in the history
Develop
  • Loading branch information
joshfactorial committed Jul 1, 2024
2 parents f334556 + 6442510 commit f1c64b8
Show file tree
Hide file tree
Showing 21 changed files with 675 additions and 293 deletions.
2 changes: 1 addition & 1 deletion config_template/simple_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ discard_bed: .
mutation_rate: .
mutation_bed: .
rng_seed: .
min_mutations: 0
min_mutations: .
overwrite_output: .

8 changes: 7 additions & 1 deletion neat/cli/commands/gen_mut_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,12 @@ def add_arguments(self, parser: argparse.ArgumentParser):
help="Includes a list of common variants, if you want to visualize "
"common variants with plot_mut_model.py.")

parser.add_argument('--save-trinuc',
action='store_true',
required=False,
default=False,
help='Saves trinucleotide counts to a separate file')

parser.add_argument('--overwrite',
action='store_true',
required=False,
Expand All @@ -83,5 +89,5 @@ def execute(self, arguments: argparse.Namespace):
:param arguments: The namespace with arguments and their values.
"""
compute_mut_runner(arguments.reference, arguments.mutations, arguments.bed, arguments.outcounts,
arguments.show_trinuc, arguments.human_sample,
arguments.show_trinuc, arguments.save_trinuc, arguments.human_sample,
arguments.skip_common, arguments.output, arguments.overwrite)
35 changes: 21 additions & 14 deletions neat/common/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,13 @@
import gzip
import logging
import os
import sys

from pathlib import Path
from typing import Callable, Iterator, TextIO
from Bio import bgzf

log = logging.getLogger(__name__)
_LOG = logging.getLogger(__name__)


def is_compressed(file: str | Path) -> bool:
Expand Down Expand Up @@ -85,9 +87,9 @@ def open_output(path: str | Path, mode: str = 'wt') -> Iterator[TextIO]:
Raises
------
FileExistsError
3 = FileExistsError
Raised if the output file already exists.
PermissionError
11 = PermissionError
Raised if the calling process does not have adequate access rights to
write to the output file.
"""
Expand All @@ -100,7 +102,8 @@ def open_output(path: str | Path, mode: str = 'wt') -> Iterator[TextIO]:
# bgzf is old code and doesn't use "xt" mode, annoyingly. This manual check should suffice.
if mode == "xt":
if output_path.exists():
raise FileExistsError(f"file '{path}' already exists")
_LOG.error(f"file '{path}' already exists")
sys.exit(3)
else:
mode = "wt"
open_ = bgzf.open
Expand All @@ -125,26 +128,28 @@ def validate_input_path(path: str | Path):
Raises
------
FileNotFoundError
5 = FileNotFoundError
Raised if the input file does not exist or is not a file.
RuntimeError
7 = RuntimeError
Raised if the input file is empty.
PermissionError
9 = PermissionError
Raised if the calling process has no read access to the file.
"""
path = Path(path)
mssg = ''

if not path.is_file():
mssg += f"Path '{path}' does not exist or not a file"
raise FileNotFoundError(mssg)
_LOG.error(mssg)
sys.exit(5)
stats = path.stat()
if stats.st_size == 0:
mssg += f"File '{path}' is empty"
raise RuntimeError(mssg)
_LOG.error(mssg)
sys.exit(7)
if not os.access(path, os.R_OK):
mssg += f"cannot read from '{path}': access denied"
raise PermissionError(mssg)
_LOG.error(9)


def validate_output_path(path: str | Path, is_file: bool = True, overwrite: bool = False):
Expand All @@ -161,18 +166,20 @@ def validate_output_path(path: str | Path, is_file: bool = True, overwrite: bool
Raises
------
FileExistsError
3 = FileExistsError
Raised if path is a file and already exists.
PermissionError
11 = PermissionError
Raised if the calling process does not have adequate access rights to.
"""
path = Path(path)
if is_file:
if path.is_file() and not overwrite:
raise FileExistsError(f"file '{path}' already exists")
_LOG.error(f"file '{path}' already exists")
sys.exit(3)
else:
if path.is_dir():
if not os.access(path, os.W_OK):
raise PermissionError(f"cannot write to '{path}', access denied")
_LOG.error(f"cannot write to '{path}', access denied")
sys.exit(11)
else:
path.parent.mkdir(parents=True, exist_ok=True)
12 changes: 6 additions & 6 deletions neat/gen_mut_model/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,11 @@
import os.path
import pathlib
import pickle
import sys

import numpy as np
from Bio import SeqIO


from pathlib import Path
import logging

Expand Down Expand Up @@ -81,7 +81,7 @@ def runner(reference_index,

if len(ignore) == len(reference_index):
_LOG.error("No valid human chromosome names detected. Check contig names reference.")
raise ValueError
sys.exit(1)

# Pre-parsing to find all the matching chromosomes between ref and vcf
_LOG.info('Processing VCF file...')
Expand All @@ -91,7 +91,7 @@ def runner(reference_index,

if not matching_variants or not matching_chromosomes:
_LOG.error("No valid variants detected. Check names in vcf versus reference and/or bed.")
raise ValueError
sys.exit(1)

trinuc_ref_count, bed_track_length = count_trinucleotides(reference_index,
bed,
Expand All @@ -100,7 +100,7 @@ def runner(reference_index,

if not trinuc_ref_count:
_LOG.error("No valid trinucleotides detected in reference.")
raise ValueError
sys.exit(1)

"""
Collect and analyze the data in the VCF file
Expand Down Expand Up @@ -155,7 +155,7 @@ def runner(reference_index,

else:
_LOG.error(f'Ref allele in variant call does not match reference: {variant}')
raise ValueError
sys.exit(1)

else:
indel_len = len(variant[3]) - len(variant[2])
Expand Down Expand Up @@ -214,7 +214,7 @@ def runner(reference_index,
if not total_var:
_LOG.error('Error: No valid variants were found, model could not be created. '
'Check that names are compatible.')
raise ValueError
sys.exit(1)

# COMPUTE PROBABILITIES

Expand Down
2 changes: 1 addition & 1 deletion neat/gen_mut_model/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def convert_trinuc_transition_matrix(trans_probs):
_LOG.error("Repeat Trinuc detected.")
_LOG.debug(f'Error on {ALL_CONTEXTS[context]}: '
f'{ALLOWED_NUCL[mutation_ref]} -> {ALLOWED_NUCL[mutation_alt]}')
raise ValueError
sys.exit(1)

return ret_matrix

Expand Down
11 changes: 6 additions & 5 deletions neat/model_fragment_lengths/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import pickle
import numpy as np
import logging
import sys

from pathlib import Path

Expand Down Expand Up @@ -44,21 +45,21 @@ def compute_fraglen_runner(file: str | Path, filter_minreads: int, output: str |
_LOG.info("Counting fragments")
all_tlens = count_frags(input_file)
if not all_tlens:
raise ValueError("No valid template lengths in sam file_list.")
_LOG.error("No valid template lengths in sam file_list.")
sys.exit(1)

_LOG.info("Filtering fragments")
filtered_lengths = filter_lengths(all_tlens, filter_minreads)

if not filtered_lengths:
raise ValueError("No data passed the filter, nothing to calculate. Try adjusting the filter settings.")
_LOG.error("No data passed the filter, nothing to calculate. Try adjusting the filter settings.")
sys.exit(1)

_LOG.info("Building model")
st_dev = float(np.std(filtered_lengths))
mean = float(np.mean(filtered_lengths))
max_tlen = max(filtered_lengths)
min_tlen = min(filtered_lengths)

model = FragmentLengthModel(st_dev, mean, max_tlen, min_tlen)
model = FragmentLengthModel(mean, st_dev)
_LOG.info(f'Saving model: {output}')
with gzip.open(output, 'w+') as outfile:
pickle.dump(model, outfile)
Expand Down
10 changes: 7 additions & 3 deletions neat/model_sequencing_error/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np
# TODO implement plotting
import matplotlib.pyplot as plt
import sys

from scipy.stats import mode
from ..common import open_input
Expand All @@ -31,7 +32,8 @@ def convert_quality_string(qual_str: str, offset: int):
try:
ret_list.append(ord(qual_str[i]) - offset)
except ValueError:
raise ValueError("improperly formatted fastq file")
_LOG.error("improperly formatted fastq file")
sys.exit(1)

return ret_list

Expand All @@ -45,7 +47,8 @@ def expand_counts(count_array: list, scores: list):
:return np.ndarray: a one-dimensional array reflecting the expanded count
"""
if len(count_array) != len(scores):
raise ValueError("Count array and scores have different lengths.")
_LOG.error("Count array and scores have different lengths.")
sys.exit(1)

ret_list = []
for i in range(len(count_array)):
Expand Down Expand Up @@ -89,7 +92,8 @@ def parse_file(input_file: str, quality_scores: list, max_reads: int, qual_offse
if readlen_mode.count < (0.5 * len(readlens)):
_LOG.warning("Highly variable read lengths detected. Results may be less than ideal.")
if readlen_mode.count < 20:
raise ValueError(f"Dataset is too scarce or inconsistent to make a model. Try a different input.")
_LOG.error(f"Dataset is too scarce or inconsistent to make a model. Try a different input.")
sys.exit(1)
read_length = int(readlen_mode.mode)

else:
Expand Down
21 changes: 17 additions & 4 deletions neat/models/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import re
import logging
import abc
import sys

from numpy.random import Generator
from Bio.Seq import Seq
Expand Down Expand Up @@ -235,7 +236,8 @@ def __init__(self,
self.homozygous_freq = homozygous_freq

if not np.isclose(sum(variant_probs.values()), 1):
raise ValueError("Probabilities do not add up to 1.")
_LOG.error("Probabilities do not add up to 1.")
sys.exit(1)

self.variant_probs = variant_probs
self.transition_matrix = transition_matrix
Expand Down Expand Up @@ -558,15 +560,26 @@ def __init__(self,
self.rng = rng

def generate_fragments(self,
total_length: int,
number_of_fragments: int) -> list:
"""
Generates a number of fragments based on the total length needed, and the mean and standard deviation of the set
:param total_length: Length of the reference segment we are covering.
:param number_of_fragments: The number of fragments needed.
:return: A list of fragment random fragment lengths sampled from the model.
"""
# Due to natural variation in genome lengths, it's difficult to harden this code against all the possible
# inputs. In order to harden this code against infinite loops caused by fragment means that the wrong size for
# the genome, we introduce a small number of standard fragments, to ensure enough variability that our code can
# complete. a few small fragments should increase the variability of the set. Most of these are too small
# to create a read, so they become spacers instead.
extra_fragments = [10, 11, 12, 13, 14, 28, 31]
# generates a distribution, assuming normality, then rounds the result and converts to ints
dist = np.round(self.rng.normal(self.fragment_mean, self.fragment_st_dev, size=number_of_fragments)).astype(int)
return [abs(x) for x in dist]
dist = [abs(x) for x in dist]
# We'll append enough fragments to pad out distribution and add variability. Don't know if the cost of doing
# this is worth it though.
number_extra = int(number_of_fragments * 0.1)
for i in range(number_extra):
dist.append(extra_fragments[i % len(extra_fragments)])
self.rng.shuffle(dist) # this shuffle mixes extra fragments in.
return dist[:number_of_fragments]
Loading

0 comments on commit f1c64b8

Please sign in to comment.