diff --git a/Snakefile b/Snakefile
index acaa62f35..cc829401d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -8,6 +8,7 @@ from getpass import getuser
 from snakemake.logging import logger
 from snakemake.utils import validate
 from collections import OrderedDict
+import textwrap
 import time
 
 # Store the user's configuration prior to loading defaults, so we can check for
@@ -74,6 +75,26 @@ if "builds" not in config:
 
 include: "workflow/snakemake_rules/reference_build_definitions.smk"
 
+# Check for old-style input file references and alert users to the new format.
+if "sequences" in config or "metadata" in config:
+    logger.error("ERROR: Your configuration file uses the deprecated top-level keys `config['sequences']` and/or `config['metadata']` to specify input files. These are no longer supported.")
+    logger.error("Update your configuration file (e.g., 'builds.yaml') to define your inputs as follows and try running the workflow again:")
+    logger.error(textwrap.indent(
+        f"\ninputs:\n  - name: local-data\n    metadata: {config.get('metadata', '<path to metadata>')}\n    sequences: {config.get('sequences', '<path to sequences>')}\n",
+        "  "
+    ))
+    sys.exit(1)
+
+# Check for missing inputs.
+if "inputs" not in config:
+    logger.error("ERROR: Your workflow does not define any input files to start with.")
+    logger.error("Update your configuration file (e.g., 'builds.yaml') to define at least one input dataset as follows and try running the workflow again:")
+    logger.error(textwrap.indent(
+        "\ninputs:\n  - name: local-data\n    metadata: data/example_metadata.tsv\n    sequences: data/example_sequences.fasta.gz\n",
+        "  "
+    ))
+    sys.exit(1)
+
 # Allow users to specify a list of active builds from the command line.
 if config.get("active_builds"):
     BUILD_NAMES = config["active_builds"].split(",")
@@ -93,7 +114,7 @@ wildcard_constraints:
     # but not special strings used for Nextstrain builds.
     build_name = r'(?:[_a-zA-Z-](?!(tip-frequencies)))+',
     date = r"[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]",
-    origin = r"(_[a-zA-Z0-9-]+)?" # origin starts with an underscore _OR_ it's the empty string
+    origin = r"[a-zA-Z0-9_-]+" # the name of an input dataset as defined in config["inputs"]
 
 localrules: download_metadata, download_sequences, clean
 
diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
index 7e382e8cf..f0974186b 100644
--- a/defaults/parameters.yaml
+++ b/defaults/parameters.yaml
@@ -6,11 +6,6 @@
 # This must be a relative path to the top-level Snakefile directory (e.g., `ncov/`).
 conda_environment: "workflow/envs/nextstrain.yaml"
 
-# These are the two main starting files for the run.
-# If they do not exist, we will attempt to fetch them from a S3 bucket (see below)
-sequences: "data/sequences.fasta"
-metadata: "data/metadata.tsv"
-
 strip_strain_prefixes:
   - hCoV-19/
   - SARS-CoV-2/
@@ -36,6 +31,9 @@ files:
   clades: "defaults/clades.tsv"
   emerging_lineages: "defaults/emerging_lineages.tsv"
 
+# Define genes to translate during alignment by nextalign.
+genes: ["ORF1a", "ORF1b", "S", "ORF3a", "M", "N"]
+
 # Filter settings
 filter:
   # Require nearly full-length genomes.
diff --git a/docs/change_log.md b/docs/change_log.md
index a07f5083a..775db6ddc 100644
--- a/docs/change_log.md
+++ b/docs/change_log.md
@@ -3,6 +3,21 @@
 As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation. We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.
 
+## v5 (7 May 2021)
+
+[See the corresponding pull request](https://github.com/nextstrain/ncov/pull/615) for more details about this release.
+
+### Major changes
+
+- Drop support for the old top-level `sequences`/`metadata` inputs
+- Use nextalign for alignment instead of mafft
+
+### Minor changes
+
+- Drop unused haplotype status rule and script
+- Remove unused nucleotide mutation frequencies rule
+- Use augur distance for mutation counts
+
 ## v4 (5 May 2021)
 
 [See the corresponding pull request](https://github.com/nextstrain/ncov/pull/605) for more details about changes in this release.
diff --git a/docs/multiple_inputs.md b/docs/multiple_inputs.md
index a40646b2c..7ba784d43 100644
--- a/docs/multiple_inputs.md
+++ b/docs/multiple_inputs.md
@@ -51,15 +51,17 @@ my_profiles/example_multiple_inputs/my_auspice_config.json
 
 ## Setting up the config
 
-Typically, inside the `builds.yaml` one would specify input files such as
+You can define a single input dataset in `builds.yaml` as follows.
 
 ```yaml
-# traditional syntax for specifying starting files
-sequences: "data/sequences.fasta"
-metadata: "data/metadata.tsv"
+inputs:
+  - name: my-data
+    metadata: "data/metadata.tsv"
+    sequences: "data/sequences.fasta"
 ```
 
-For multiple inputs, we shall use the new `inputs` section of the config to specify that we have two different inputs, and we will give them the names "aus" and "worldwide":
+For multiple inputs, add one entry to the `inputs` config list per dataset.
+Here, we define two inputs named "aus" and "worldwide":
 
 ```yaml
 # my_profiles/example_multiple_inputs/builds.yaml
@@ -72,15 +74,11 @@ inputs:
     sequences: "data/example_sequences_worldwide.fasta"
 ```
 
-> Note that if you also specify `sequences` or `metadata` as top level entries in the config, they will be ignored.
-
 ### Snakemake terminology
 
 Inside the Snakemake rules, we use a wildcard `origin` to define different starting points.
-For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="_worldwide"` and we expect that the config has defined
-a sequences input via `config["sequences"]["worldwide"]=` (note the leading `_` has been stripped from the `origin` in the config).
-If we use the older syntax (specifying `sequences` or `metadata` as top level entries in the config) then `wildcards.origin=""`.
-
+For instance, if we ask for the file `results/aligned_worldwide.fasta`, then `wildcards.origin="worldwide"` and we expect the config to define
+a sequences input for that origin, as shown above.
 
 ## How is metadata combined?
diff --git a/my_profiles/example/builds.yaml b/my_profiles/example/builds.yaml
index eea133e11..4ff505fd9 100644
--- a/my_profiles/example/builds.yaml
+++ b/my_profiles/example/builds.yaml
@@ -11,6 +11,12 @@
 
 # In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme.
 
+# Define input files.
+inputs: + - name: example-data + metadata: data/example_metadata.tsv + sequences: data/example_sequences.fasta + builds: # Focus on King County (location) in Washington State (division) in the USA (country) # with a build name that will produce the following URL fragment on Nextstrain/auspice: diff --git a/my_profiles/example/config.yaml b/my_profiles/example/config.yaml index 560e90c71..42aa8503d 100644 --- a/my_profiles/example/config.yaml +++ b/my_profiles/example/config.yaml @@ -10,10 +10,6 @@ configfile: - defaults/parameters.yaml # Pull in the default values - my_profiles/example/builds.yaml # Pull in our list of desired builds -config: - - sequences=data/example_sequences.fasta - - metadata=data/example_metadata.tsv - # Set the maximum number of cores you want Snakemake to use for this pipeline. cores: 2 diff --git a/my_profiles/getting_started/builds.yaml b/my_profiles/getting_started/builds.yaml index f72a562c7..0dd5df7b0 100644 --- a/my_profiles/getting_started/builds.yaml +++ b/my_profiles/getting_started/builds.yaml @@ -8,6 +8,12 @@ # These subsample primarily from the area of interest ("focus"), and add in background ("contextual") sequences from the rest of the world. # Contextual sequences that are genetically similar to (hamming distance) and geographically near the focal sequences are heavily prioritized. +# Define input files. +inputs: + - name: example-data + metadata: data/example_metadata.tsv + sequences: data/example_sequences.fasta.gz + # In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme. builds: # This build samples evenly from the globe diff --git a/my_profiles/getting_started/config.yaml b/my_profiles/getting_started/config.yaml index 8b5fa831d..c0e97f068 100644 --- a/my_profiles/getting_started/config.yaml +++ b/my_profiles/getting_started/config.yaml @@ -10,10 +10,6 @@ configfile: - defaults/parameters.yaml # Pull in the default values - my_profiles/getting_started/builds.yaml # Pull in our list of desired builds -config: - - sequences=data/example_sequences.fasta - - metadata=data/example_metadata.tsv - # Set the maximum number of cores you want Snakemake to use for this pipeline. cores: 1 diff --git a/scripts/annotate-haplotype-status.py b/scripts/annotate-haplotype-status.py deleted file mode 100644 index cc5e54dac..000000000 --- a/scripts/annotate-haplotype-status.py +++ /dev/null @@ -1,36 +0,0 @@ -"""Annotate haplotype status for all sequences that match a given reference node name. -""" -import argparse -from augur.utils import write_json -import json -import sys - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--ancestral-sequences", required=True, help="node data JSON of nucleotide mutations including observed and inferred by TreeTime") - parser.add_argument("--reference-node-name", required=True, help="name of the node whose sequence flags the desired haplotype") - parser.add_argument("--attribute-name", default="haplotype_status", help="name of attribute for haplotype status") - parser.add_argument("--output", required=True, help="node data JSON with annotated haplotype status based on the given reference node's sequence") - - args = parser.parse_args() - - with open(args.ancestral_sequences, "r") as fh: - sequences = json.load(fh) - - if args.reference_node_name not in sequences["nodes"]: - print("ERROR: Could not find the requested reference node named '%s' in the given ancestral sequences." 
% args.reference_node_name, file=sys.stderr) - sys.exit(1) - - haplotype_sequence = sequences["nodes"][args.reference_node_name]["sequence"] - haplotype_status = {"nodes": {}} - - for node in sequences["nodes"]: - if sequences["nodes"][node]["sequence"] == haplotype_sequence: - status = "haplotype matches %s" % args.reference_node_name - else: - status = "haplotype does not match %s" % args.reference_node_name - - haplotype_status["nodes"][node] = {args.attribute_name: status} - - write_json(haplotype_status, args.output) diff --git a/scripts/mutation_counts.py b/scripts/mutation_counts.py deleted file mode 100644 index e1ed496f3..000000000 --- a/scripts/mutation_counts.py +++ /dev/null @@ -1,55 +0,0 @@ -import argparse, json -from Bio import Phylo, SeqIO -import pandas as pd -import numpy as np - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="count distances based on distance maps with counting gaps as one event", - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - - parser.add_argument('--tree', type=str, required=True, help="tree file") - parser.add_argument('--alignment', type=str) - parser.add_argument('--gene-names', nargs='+') - parser.add_argument('--compare-to', nargs='+') - parser.add_argument('--map', nargs='+') - parser.add_argument('--attribute-name', nargs='+') - parser.add_argument('--output', type=str, required=True, help="output json file") - args = parser.parse_args() - - if type(args.map)==str: - args.map = [args.map] - - if type(args.attribute_name)==str: - args.attribute_name = [args.attribute_name] - - node_data = {} - - T = Phylo.read(args.tree, 'newick') - aln = {s.name:str(s.seq) for s in SeqIO.parse(args.alignment, 'fasta')} - root = aln[T.root.name] - - for map_file, attribute in zip(args.map, args.attribute_name): - with open(map_file) as fh: - map = json.load(fh) - - weights = {int(k)-1:v for k,v in map["map"][args.gene_names[0]].items()} - default = map['default'] - for n in T.find_clades(): - if n.name not in node_data: node_data[n.name] = {} - distance = 0 - gap_extend=False - if n.name not in aln: - print(f"{n.name} not found in alignment") - continue - - for p, (a,d) in enumerate(zip(root, aln[n.name])): - if a!=d and (not gap_extend): - distance+=weights.get(p, default) - - gap_extend= d=='-' - node_data[n.name][f"{attribute}"] = distance - - with open(args.output, 'w') as fh: - json.dump({"nodes":node_data}, fh) diff --git a/workflow/snakemake_rules/common.smk b/workflow/snakemake_rules/common.smk index 76c4d37b2..aa31fd28e 100644 --- a/workflow/snakemake_rules/common.smk +++ b/workflow/snakemake_rules/common.smk @@ -25,12 +25,6 @@ def numeric_date(dt=None): return res -def _trim_origin(origin): - """the origin wildcard includes a leading `_`. 
This function returns the value without this `_`"""
-    if origin=="":
-        return ""
-    return origin[1:]
-
 def _get_subsampling_scheme_by_build_name(build_name):
     return config["builds"][build_name].get("subsampling_scheme", build_name)
 
@@ -38,7 +32,7 @@ def _get_filter_value(wildcards, key):
     default = config["filter"].get(key, "")
     if wildcards["origin"] == "":
         return default
-    return config["filter"].get(_trim_origin(wildcards["origin"]), {}).get(key, default)
+    return config["filter"].get(wildcards["origin"], {}).get(key, default)
 
 def _get_path_for_input(stage, origin_wildcard):
     """
@@ -46,45 +40,36 @@ def _get_path_for_input(stage, origin_wildcard):
     This function always returns a local filepath, the format of which decides whether
     rules should create this by downloading from a remote resource, or create it by a local
     compute rule.
     """
-    if not origin_wildcard:
-        # No origin wildcards => deprecated single inputs (e.g. `config["sequences"]`) which cannot
-        # be downloaded from remote resources
-        if config.get("inputs"):
-            raise Exception("ERROR: empty origin wildcard but config defines 'inputs`")
-        path_or_url = config[stage] if stage in ["metadata", "sequences"] else ""
-        remote = False
-    else:
-        trimmed_origin = _trim_origin(origin_wildcard)
-        path_or_url = config.get("inputs", {}).get(trimmed_origin, {}).get(stage, "")
-        scheme = urlsplit(path_or_url).scheme
-        remote = bool(scheme)
+    path_or_url = config.get("inputs", {}).get(origin_wildcard, {}).get(stage, "")
+    scheme = urlsplit(path_or_url).scheme
+    remote = bool(scheme)
 
-        # Following checking should be the remit of the rule which downloads the remote resource
-        if scheme and scheme!="s3":
-            raise Exception(f"Input defined scheme {scheme} which is not yet supported.")
+    # This check should really be the remit of the rule that downloads the remote resource
+    if scheme and scheme!="s3":
+        raise Exception(f"Input defines scheme '{scheme}', which is not yet supported.")
 
-        ## Basic checking which could be taken care of by the config schema
-        ## If asking for metadata/sequences, the config _must_ supply a `path_or_url`
-        if path_or_url=="" and stage in ["metadata", "sequences"]:
-            raise Exception(f"ERROR: config->input->{trimmed_origin}->{stage} is not defined.")
+    ## Basic checking which could be taken care of by the config schema
+    ## If asking for metadata/sequences, the config _must_ supply a `path_or_url`
+    if path_or_url=="" and stage in ["metadata", "sequences"]:
+        raise Exception(f"ERROR: config->inputs->{origin_wildcard}->{stage} is not defined.")
 
     if stage=="metadata":
-        return f"data/downloaded{origin_wildcard}.tsv" if remote else path_or_url
+        return f"data/downloaded_{origin_wildcard}.tsv" if remote else path_or_url
     if stage=="sequences":
-        return f"data/downloaded{origin_wildcard}.fasta" if remote else path_or_url
+        return f"data/downloaded_{origin_wildcard}.fasta" if remote else path_or_url
     if stage=="aligned":
-        return f"results/precomputed-aligned{origin_wildcard}.fasta" if remote else f"results/aligned{origin_wildcard}.fasta"
+        return f"results/precomputed-aligned_{origin_wildcard}.fasta" if remote else f"results/aligned_{origin_wildcard}.fasta"
    if stage=="to-exclude":
-        return f"results/precomputed-to-exclude{origin_wildcard}.txt" if remote else f"results/to-exclude{origin_wildcard}.txt"
+        return f"results/precomputed-to-exclude_{origin_wildcard}.txt" if remote else f"results/to-exclude_{origin_wildcard}.txt"
    if stage=="masked":
-        return f"results/precomputed-masked{origin_wildcard}.fasta" if remote else f"results/masked{origin_wildcard}.fasta"
f"results/masked{origin_wildcard}.fasta" + return f"results/precomputed-masked_{origin_wildcard}.fasta" if remote else f"results/masked_{origin_wildcard}.fasta" if stage=="filtered": if remote: - return f"results/precomputed-filtered{origin_wildcard}.fasta" + return f"results/precomputed-filtered_{origin_wildcard}.fasta" elif path_or_url: return path_or_url else: - return f"results/filtered{origin_wildcard}.fasta" + return f"results/filtered_{origin_wildcard}.fasta" raise Exception(f"_get_path_for_input with unknown stage \"{stage}\"") @@ -92,22 +77,17 @@ def _get_path_for_input(stage, origin_wildcard): def _get_unified_metadata(wildcards): """ Returns a single metadata file representing the input metadata file(s). - If there was only one supplied metadata file (e.g. the deprecated - `config["metadata"]` syntax, or one entry in the `config["inputs"] dict`) + If there was only one supplied metadata file in the `config["inputs"] dict`, then that file is returned. Else "results/combined_metadata.tsv" is returned which will run the `combine_input_metadata` rule to make it. """ - if not config.get("inputs"): - return config["metadata"] if len(list(config["inputs"].keys()))==1: - return "results/sanitized_metadata{origin}.tsv".format(origin="_"+list(config["inputs"].keys())[0]) + return "results/sanitized_metadata_{origin}.tsv".format(origin=list(config["inputs"].keys())[0]) return "results/combined_metadata.tsv" def _get_unified_alignment(wildcards): - if not config.get("inputs"): - return "results/filtered.fasta" if len(list(config["inputs"].keys()))==1: - return _get_path_for_input("filtered", "_"+list(config["inputs"].keys())[0]) + return _get_path_for_input("filtered", list(config["inputs"].keys())[0]) return "results/combined_sequences_for_subsampling.fasta", def _get_metadata_by_build_name(build_name): diff --git a/workflow/snakemake_rules/download.smk b/workflow/snakemake_rules/download.smk index f7a0b09db..9dd9fc798 100644 --- a/workflow/snakemake_rules/download.smk +++ b/workflow/snakemake_rules/download.smk @@ -8,8 +8,8 @@ # rule: align # input: _get_path_for_input # ... -# will result in an input file looking like "results/aligned{origin}.fasta" or -# "results/download-aligned{origin}.fasta" (which one is chosen depends on the +# will result in an input file looking like "results/aligned_{origin}.fasta" or +# "results/download-aligned_{origin}.fasta" (which one is chosen depends on the # supplied `config`). In the latter case, `rule download_aligned` will be used. # See https://github.com/nextstrain/ncov/compare/remote-files for an example of # how we could leverage snakemake to do this without needing a separate rule! 
@@ -33,11 +33,11 @@ def _infer_decompression(input):
 rule download_sequences:
     message: "Downloading sequences from {params.address} -> {output.sequences}"
     output:
-        sequences = "data/downloaded{origin}.fasta"
+        sequences = "data/downloaded_{origin}.fasta"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["sequences"],
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["sequences"])
+        address = lambda w: config["inputs"][w.origin]["sequences"],
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["sequences"])
     shell:
         """
         aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q}
@@ -46,11 +46,11 @@ rule download_metadata:
     message: "Downloading metadata from {params.address} -> {output.metadata}"
     output:
-        metadata = "data/downloaded{origin}.tsv"
+        metadata = "data/downloaded_{origin}.tsv"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["metadata"],
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["metadata"]),
+        address = lambda w: config["inputs"][w.origin]["metadata"],
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["metadata"]),
     shell:
         """
         aws s3 cp {params.address} - | {params.deflate} > {output.metadata:q}
@@ -59,11 +59,11 @@ rule download_aligned:
     message: "Downloading aligned fasta files from {params.address} -> {output.sequences}"
     output:
-        sequences = "results/precomputed-aligned{origin}.fasta"
+        sequences = "results/precomputed-aligned_{origin}.fasta"
     conda: config["conda_environment"]
     params:
-        address = lambda w: config["inputs"][_trim_origin(w.origin)]["aligned"],
-        deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["aligned"])
+        address = lambda w: config["inputs"][w.origin]["aligned"],
+        deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["aligned"])
     shell:
         """
         aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q}
@@ -77,17 +77,17 @@ rule download_diagnostic:
         {params.to_exclude_address} -> {output.to_exclude}
         """
     output:
-        diagnostics = "results/precomputed-sequence-diagnostics{origin}.tsv",
-        flagged = "results/precomputed-flagged-sequences{origin}.tsv",
-        to_exclude = "results/precomputed-to-exclude{origin}.txt"
+        diagnostics = "results/precomputed-sequence-diagnostics_{origin}.tsv",
+        flagged = "results/precomputed-flagged-sequences_{origin}.tsv",
+        to_exclude = "results/precomputed-to-exclude_{origin}.txt"
     conda: config["conda_environment"]
     params:
         # Only `to-exclude` is defined via the config, so we make some assumptions about the format of the other filenames
-        to_exclude_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"],
-        flagged_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'flagged-sequences{w.origin}.tsv'),
-        diagnostics_address = lambda w: config["inputs"][_trim_origin(w.origin)]["to-exclude"].replace(f'to-exclude{w.origin}.txt', f'sequence-diagnostics{w.origin}.tsv'),
+        to_exclude_address = lambda w: config["inputs"][w.origin]["to-exclude"],
+        flagged_address = lambda w: config["inputs"][w.origin]["to-exclude"].replace(f'to-exclude_{w.origin}.txt', f'flagged-sequences_{w.origin}.tsv'),
+        diagnostics_address = lambda w: config["inputs"][w.origin]["to-exclude"].replace(f'to-exclude_{w.origin}.txt', f'sequence-diagnostics_{w.origin}.tsv'),
# assume the compression is the same across all 3 addresses - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["to-exclude"]) + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["to-exclude"]) shell: """ aws s3 cp {params.to_exclude_address} - | {params.deflate} > {output.to_exclude:q} @@ -99,11 +99,11 @@ rule download_diagnostic: rule download_masked: message: "Downloading aligned & masked FASTA from {params.address} -> {output.sequences}" output: - sequences = "results/precomputed-masked{origin}.fasta" + sequences = "results/precomputed-masked_{origin}.fasta" conda: config["conda_environment"] params: - address = lambda w: config["inputs"][_trim_origin(w.origin)]["masked"], - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["masked"]) + address = lambda w: config["inputs"][w.origin]["masked"], + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["masked"]) shell: """ aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q} @@ -113,12 +113,12 @@ rule download_masked: rule download_filtered: message: "Downloading pre-computed filtered alignment from {params.address} -> {output.sequences}" output: - sequences = "results/precomputed-filtered{origin}.fasta" + sequences = "results/precomputed-filtered_{origin}.fasta" conda: config["conda_environment"] params: - address = lambda w: config["inputs"][_trim_origin(w.origin)]["filtered"], - deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["filtered"]) + address = lambda w: config["inputs"][w.origin]["filtered"], + deflate = lambda w: _infer_decompression(config["inputs"][w.origin]["filtered"]) shell: """ aws s3 cp {params.address} - | {params.deflate} > {output.sequences:q} - """ \ No newline at end of file + """ diff --git a/workflow/snakemake_rules/export_for_nextstrain.smk b/workflow/snakemake_rules/export_for_nextstrain.smk index 782f58051..b1dd901dd 100644 --- a/workflow/snakemake_rules/export_for_nextstrain.smk +++ b/workflow/snakemake_rules/export_for_nextstrain.smk @@ -85,9 +85,6 @@ rule export_all_regions: """ -rule all_mutation_frequencies: - input: expand("results/{build_name}/nucleotide_mutation_frequencies.json", build_name=BUILD_NAMES) - rule mutation_summary: message: "Summarizing {input.alignment}" input: @@ -97,14 +94,14 @@ rule mutation_summary: reference = config["files"]["alignment_reference"], genemap = config["files"]["annotation"] output: - mutation_summary = "results/mutation_summary{origin}.tsv" + mutation_summary = "results/mutation_summary_{origin}.tsv" log: - "logs/mutation_summary{origin}.txt" + "logs/mutation_summary_{origin}.txt" benchmark: - "benchmarks/mutation_summary{origin}.txt" + "benchmarks/mutation_summary_{origin}.txt" params: outdir = "results/translations", - basename = "seqs{origin}" + basename = "seqs_{origin}" conda: config["conda_environment"] shell: """ diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk index a8ea9aecd..f9cacf084 100644 --- a/workflow/snakemake_rules/main_workflow.smk +++ b/workflow/snakemake_rules/main_workflow.smk @@ -2,13 +2,13 @@ rule sanitize_metadata: input: metadata=lambda wildcards: _get_path_for_input("metadata", wildcards.origin) output: - metadata="results/sanitized_metadata{origin}.tsv" + metadata="results/sanitized_metadata_{origin}.tsv" benchmark: - "benchmarks/sanitize_metadata{origin}.txt" + "benchmarks/sanitize_metadata_{origin}.txt" conda: config["conda_environment"] log: - 
"logs/sanitize_metadata{origin}.txt" + "logs/sanitize_metadata_{origin}.txt" params: strain_prefixes=config["strip_strain_prefixes"], shell: @@ -27,7 +27,7 @@ rule combine_input_metadata: Combining metadata files {input.metadata} -> {output.metadata} and adding columns to represent origin """ input: - metadata=expand("results/sanitized_metadata{origin}.tsv", origin=[f"_{origin}" if origin else "" for origin in config.get("inputs", "")]), + metadata=expand("results/sanitized_metadata_{origin}.tsv", origin=config.get("inputs")), output: metadata = "results/combined_metadata.tsv" params: @@ -42,103 +42,68 @@ rule combine_input_metadata: python3 scripts/combine_metadata.py --metadata {input.metadata} --origins {params.origins} --output {output.metadata} 2>&1 | tee {log} """ -if "use_nextalign" in config and config["use_nextalign"]: - rule align: - message: - """ - Aligning sequences to {input.reference} - - gaps relative to reference are considered real - """ - input: - sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin), - genemap = config["files"]["annotation"], - reference = config["files"]["alignment_reference"] - output: - alignment = "results/aligned{origin}.fasta", - insertions = "results/insertions{origin}.tsv", - translations = expand("results/translations/seqs{{origin}}.gene.{gene}.fasta", gene=config.get('genes', ['S'])) - params: - outdir = "results/translations", - genes = ','.join(config.get('genes', ['S'])), - basename = "seqs{origin}", - strain_prefixes=config["strip_strain_prefixes"], - log: - "logs/align{origin}.txt" - benchmark: - "benchmarks/align{origin}.txt" - conda: config["conda_environment"] - threads: 8 - resources: - mem_mb=3000 - shell: - """ - python3 scripts/sanitize_sequences.py \ - --sequences {input.sequences} \ - --strip-prefixes {params.strain_prefixes:q} \ - --output /dev/stdout \ - | nextalign \ - --jobs={threads} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --genes {params.genes} \ - --sequences /dev/stdin \ - --output-dir {params.outdir} \ - --output-basename {params.basename} \ - --output-fasta {output.alignment} \ - --output-insertions {output.insertions} > {log} 2>&1 - """ -else: - rule align: - message: - """ - Aligning sequences from {input.sequences} to {input.reference} +rule align: + message: + """ + Aligning sequences to {input.reference} - gaps relative to reference are considered real - """ - input: - sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin), - reference = config["files"]["alignment_reference"] - output: - alignment = "results/aligned{origin}.fasta" - log: - "logs/align{origin}.txt" - benchmark: - "benchmarks/align{origin}.txt" - threads: 16 - conda: config["conda_environment"] - params: - strain_prefixes=config["strip_strain_prefixes"], - shell: - """ - python3 scripts/sanitize_sequences.py \ - --sequences {input.sequences} \ - --strip-prefixes {params.strain_prefixes:q} \ - --output /dev/stdout \ - | mafft \ - --auto \ - --thread {threads} \ - --keeplength \ - --addfragments \ - /dev/stdin \ - {input.reference} > {output} 2> {log} - """ + """ + input: + sequences = lambda wildcards: _get_path_for_input("sequences", wildcards.origin), + genemap = config["files"]["annotation"], + reference = config["files"]["alignment_reference"] + output: + alignment = "results/aligned_{origin}.fasta", + insertions = "results/insertions_{origin}.tsv", + translations = expand("results/translations/seqs_{{origin}}.gene.{gene}.fasta", gene=config.get('genes', ['S'])) + 
params: + outdir = "results/translations", + genes = ','.join(config.get('genes', ['S'])), + basename = "seqs_{origin}", + strain_prefixes=config["strip_strain_prefixes"], + log: + "logs/align_{origin}.txt" + benchmark: + "benchmarks/align_{origin}.txt" + conda: config["conda_environment"] + threads: 8 + resources: + mem_mb=3000 + shell: + """ + python3 scripts/sanitize_sequences.py \ + --sequences {input.sequences} \ + --strip-prefixes {params.strain_prefixes:q} \ + --output /dev/stdout \ + | nextalign \ + --jobs={threads} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --genes {params.genes} \ + --sequences /dev/stdin \ + --output-dir {params.outdir} \ + --output-basename {params.basename} \ + --output-fasta {output.alignment} \ + --output-insertions {output.insertions} > {log} 2>&1 + """ rule diagnostic: message: "Scanning aligned sequences {input.alignment} for problematic sequences" input: alignment = lambda wildcards: _get_path_for_input("aligned", wildcards.origin), - metadata = "results/sanitized_metadata{origin}.tsv", + metadata = "results/sanitized_metadata_{origin}.tsv", reference = config["files"]["reference"] output: - diagnostics = "results/sequence-diagnostics{origin}.tsv", - flagged = "results/flagged-sequences{origin}.tsv", - to_exclude = "results/to-exclude{origin}.txt" + diagnostics = "results/sequence-diagnostics_{origin}.tsv", + flagged = "results/flagged-sequences_{origin}.tsv", + to_exclude = "results/to-exclude_{origin}.txt" log: - "logs/diagnostics{origin}.txt" + "logs/diagnostics_{origin}.txt" params: mask_from_beginning = config["mask"]["mask_from_beginning"], mask_from_end = config["mask"]["mask_from_end"] benchmark: - "benchmarks/diagnostics{origin}.txt" + "benchmarks/diagnostics_{origin}.txt" resources: # Memory use scales primarily with the size of the metadata file. mem_mb=lambda wildcards, input: 15 * int(input.metadata.size / 1024 / 1024) @@ -161,7 +126,7 @@ def _collect_exclusion_files(wildcards): # As well as the sequences flagged by the diagnostic step. # Note that we can skip the diagnostic step on a per-input (per-origin) basis. 
exclude_files = [ config["files"]["exclude"] ] - if not config["filter"].get(_trim_origin(wildcards["origin"]), {}).get("skip_diagnostics", False): + if not config["filter"].get(wildcards["origin"], {}).get("skip_diagnostics", False): exclude_files.append(_get_path_for_input("to-exclude", wildcards.origin)) return exclude_files @@ -176,11 +141,11 @@ rule mask: input: alignment = lambda w: _get_path_for_input("aligned", w.origin) output: - alignment = "results/masked{origin}.fasta" + alignment = "results/masked_{origin}.fasta" log: - "logs/mask{origin}.txt" + "logs/mask_{origin}.txt" benchmark: - "benchmarks/mask{origin}.txt" + "benchmarks/mask_{origin}.txt" params: mask_from_beginning = config["mask"]["mask_from_beginning"], mask_from_end = config["mask"]["mask_from_end"], @@ -207,16 +172,16 @@ rule filter: """ input: sequences = lambda wildcards: _get_path_for_input("masked", wildcards.origin), - metadata = "results/sanitized_metadata{origin}.tsv", + metadata = "results/sanitized_metadata_{origin}.tsv", # TODO - currently the include / exclude files are not input (origin) specific, but this is possible if we want include = config["files"]["include"], exclude = _collect_exclusion_files, output: - sequences = "results/filtered{origin}.fasta" + sequences = "results/filtered_{origin}.fasta" log: - "logs/filtered{origin}.txt" + "logs/filtered_{origin}.txt" benchmark: - "benchmarks/filter{origin}.txt" + "benchmarks/filter_{origin}.txt" params: min_length = lambda wildcards: _get_filter_value(wildcards, "min_length"), exclude_where = lambda wildcards: _get_filter_value(wildcards, "exclude_where"), @@ -350,7 +315,7 @@ rule combine_sequences_for_subsampling: Combine and deduplicate aligned & filtered FASTAs from multiple origins in preparation for subsampling. 
""" input: - lambda w: [_get_path_for_input("filtered", f"_{origin}") for origin in config.get("inputs", {})] + lambda w: [_get_path_for_input("filtered", origin) for origin in config.get("inputs", {})] output: "results/combined_sequences_for_subsampling.fasta" benchmark: @@ -551,74 +516,45 @@ rule combine_samples: --output-metadata {output.metadata} 2>&1 | tee {log} """ -if "use_nextalign" in config and config["use_nextalign"]: - rule build_align: - message: - """ - Aligning sequences to {input.reference} - - gaps relative to reference are considered real - """ - input: - sequences = rules.combine_samples.output.sequences, - genemap = config["files"]["annotation"], - reference = config["files"]["alignment_reference"] - output: - alignment = "results/{build_name}/aligned.fasta", - insertions = "results/{build_name}/insertions.tsv", - translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S'])) - params: - outdir = "results/{build_name}/translations", - genes = ','.join(config.get('genes', ['S'])), - basename = "aligned" - log: - "logs/align_{build_name}.txt" - benchmark: - "benchmarks/align_{build_name}.txt" - conda: config["conda_environment"] - threads: 8 - resources: - mem_mb=3000 - shell: - """ - nextalign \ - --jobs={threads} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --genes {params.genes} \ - --sequences {input.sequences} \ - --output-dir {params.outdir} \ - --output-basename {params.basename} \ - --output-fasta {output.alignment} \ - --output-insertions {output.insertions} > {log} 2>&1 - """ -else: - rule build_align: - message: - """ - Aligning sequences from {input.sequences} to {input.reference} +rule build_align: + message: + """ + Aligning sequences to {input.reference} - gaps relative to reference are considered real - """ - input: - sequences = rules.combine_samples.output.sequences, - reference = config["files"]["alignment_reference"] - output: - alignment = "results/{build_name}/aligned.fasta" - log: - "logs/align_{build_name}.txt" - benchmark: - "benchmarks/align_{build_name}.txt" - threads: 16 - conda: config["conda_environment"] - shell: - """ - mafft \ - --auto \ - --thread {threads} \ - --keeplength \ - --addfragments \ - {input.sequences} \ - {input.reference} > {output} 2> {log} - """ + """ + input: + sequences = rules.combine_samples.output.sequences, + genemap = config["files"]["annotation"], + reference = config["files"]["alignment_reference"] + output: + alignment = "results/{build_name}/aligned.fasta", + insertions = "results/{build_name}/insertions.tsv", + translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S'])) + params: + outdir = "results/{build_name}/translations", + genes = ','.join(config.get('genes', ['S'])), + basename = "aligned" + log: + "logs/align_{build_name}.txt" + benchmark: + "benchmarks/align_{build_name}.txt" + conda: config["conda_environment"] + threads: 8 + resources: + mem_mb=3000 + shell: + """ + nextalign \ + --jobs={threads} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --genes {params.genes} \ + --sequences {input.sequences} \ + --output-dir {params.outdir} \ + --output-basename {params.basename} \ + --output-fasta {output.alignment} \ + --output-insertions {output.insertions} > {log} 2>&1 + """ if "run_pangolin" in config and config["run_pangolin"]: rule run_pangolin: @@ -812,27 +748,6 @@ rule ancestral: --infer-ambiguous 2>&1 | tee {log} """ -rule haplotype_status: - 
message: "Annotating haplotype status relative to {params.reference_node_name}" - input: - nt_muts = rules.ancestral.output.node_data - output: - node_data = "results/{build_name}/haplotype_status.json" - log: - "logs/haplotype_status_{build_name}.txt" - benchmark: - "benchmarks/haplotype_status_{build_name}.txt" - params: - reference_node_name = config["reference_node_name"] - conda: config["conda_environment"] - shell: - """ - python3 scripts/annotate-haplotype-status.py \ - --ancestral-sequences {input.nt_muts} \ - --reference-node-name {params.reference_node_name:q} \ - --output {output.node_data} 2>&1 | tee {log} - """ - rule translate: message: "Translating amino acid sequences" input: @@ -886,57 +801,59 @@ rule aa_muts_explicit: --genes {params.genes} \ --output {output.node_data} 2>&1 | tee {log} """ -if "use_nextalign" in config and config["use_nextalign"]: - rule build_mutation_summary: - message: "Summarizing {input.alignment}" - input: - alignment = rules.build_align.output.alignment, - insertions = rules.build_align.output.insertions, - translations = rules.build_align.output.translations, - reference = config["files"]["alignment_reference"], - genemap = config["files"]["annotation"] - output: - mutation_summary = "results/{build_name}/mutation_summary.tsv" - log: - "logs/mutation_summary_{build_name}.txt" - params: - outdir = "results/{build_name}/translations", - basename = "aligned" - conda: config["conda_environment"] - shell: - """ - python3 scripts/mutation_summary.py \ - --alignment {input.alignment} \ - --insertions {input.insertions} \ - --directory {params.outdir} \ - --basename {params.basename} \ - --reference {input.reference} \ - --genemap {input.genemap} \ - --output {output.mutation_summary} 2>&1 | tee {log} - """ - rule distances: - input: - tree = rules.refine.output.tree, - alignments = "results/{build_name}/translations/aligned.gene.S_withInternalNodes.fasta", - distance_maps = ["defaults/distance_maps/S1.json"] - params: - genes = 'S', - comparisons = ['root'], - attribute_names = ['S1_mutations'] - output: - node_data = "results/{build_name}/distances.json" - shell: - """ - python scripts/mutation_counts.py \ - --tree {input.tree} \ - --alignment {input.alignments} \ - --gene-names {params.genes} \ - --compare-to {params.comparisons} \ - --attribute-name {params.attribute_names} \ - --map {input.distance_maps} \ - --output {output} - """ +rule build_mutation_summary: + message: "Summarizing {input.alignment}" + input: + alignment = rules.build_align.output.alignment, + insertions = rules.build_align.output.insertions, + translations = rules.build_align.output.translations, + reference = config["files"]["alignment_reference"], + genemap = config["files"]["annotation"] + output: + mutation_summary = "results/{build_name}/mutation_summary.tsv" + log: + "logs/mutation_summary_{build_name}.txt" + params: + outdir = "results/{build_name}/translations", + basename = "aligned" + conda: config["conda_environment"] + shell: + """ + python3 scripts/mutation_summary.py \ + --alignment {input.alignment} \ + --insertions {input.insertions} \ + --directory {params.outdir} \ + --basename {params.basename} \ + --reference {input.reference} \ + --genemap {input.genemap} \ + --output {output.mutation_summary} 2>&1 | tee {log} + """ + +rule distances: + input: + tree = rules.refine.output.tree, + alignments = "results/{build_name}/translations/aligned.gene.S_withInternalNodes.fasta", + distance_maps = ["defaults/distance_maps/S1.json"] + params: + genes = 'S', + comparisons 
= ['root'], + attribute_names = ['S1_mutations'] + output: + node_data = "results/{build_name}/distances.json" + conda: + config["conda_environment"] + shell: + """ + augur distance \ + --tree {input.tree} \ + --alignment {input.alignments} \ + --gene-names {params.genes} \ + --compare-to {params.comparisons} \ + --attribute-name {params.attribute_names} \ + --map {input.distance_maps} \ + --output {output} + """ rule traits: message: @@ -1179,42 +1096,6 @@ rule logistic_growth: --output {output.node_data} 2>&1 | tee {log} """ -rule nucleotide_mutation_frequencies: - message: "Estimate nucleotide mutation frequencies" - input: - alignment = rules.build_align.output.alignment, - metadata = _get_metadata_by_wildcards - output: - frequencies = "results/{build_name}/nucleotide_mutation_frequencies.json" - log: - "logs/nucleotide_mutation_frequencies_{build_name}.txt" - benchmark: - "benchmarks/nucleotide_mutation_frequencies_{build_name}.txt" - params: - min_date = config["frequencies"]["min_date"], - minimal_frequency = config["frequencies"]["minimal_frequency"], - pivot_interval = config["frequencies"]["pivot_interval"], - stiffness = config["frequencies"]["stiffness"], - inertia = config["frequencies"]["inertia"] - resources: - # Memory use scales primarily with the size of the metadata file. - mem_mb=lambda wildcards, input: 15 * int(input.metadata.size / 1024 / 1024) - conda: config["conda_environment"] - shell: - """ - augur frequencies \ - --method diffusion \ - --alignments {input.alignment} \ - --gene-names nuc \ - --metadata {input.metadata} \ - --min-date {params.min_date} \ - --minimal-frequency {params.minimal_frequency} \ - --pivot-interval {params.pivot_interval} \ - --stiffness {params.stiffness} \ - --inertia {params.inertia} \ - --output {output.frequencies} 2>&1 | tee {log} - """ - def export_title(wildcards): # TODO: maybe we could replace this with a config entry for full/human-readable build name? location_name = wildcards.build_name @@ -1314,7 +1195,7 @@ rule add_branch_labels: python3 ./scripts/add_branch_labels.py \ --input {input.auspice_json} \ --emerging-clades {input.emerging_clades} \ - --output {output.auspice_json} + --output {output.auspice_json} """ rule incorporate_travel_history: