Merge pull request #615 from nextstrain/drop-backward-compatibility
Drop backward compatibility
huddlej committed May 7, 2021
2 parents a9b66a2 + 98d51a6 commit c7a707f
Showing 14 changed files with 264 additions and 461 deletions.
23 changes: 22 additions & 1 deletion Snakefile
@@ -8,6 +8,7 @@ from getpass import getuser
from snakemake.logging import logger
from snakemake.utils import validate
from collections import OrderedDict
import textwrap
import time

# Store the user's configuration prior to loading defaults, so we can check for
@@ -74,6 +75,26 @@ if "builds" not in config:

include: "workflow/snakemake_rules/reference_build_definitions.smk"

# Check for old-style input file references and alert users to the new format.
if "sequences" in config or "metadata" in config:
logger.error("ERROR: Your configuration file includes references to an unsupported specification of input files (e.g., `config['sequences']` or `config['metadata']`).")
logger.error("Update your configuration file (e.g., 'builds.yaml') to define your inputs as follows and try running the workflow again:")
logger.error(textwrap.indent(
f"\ninputs:\n name: local-data\n metadata: {config['metadata']}\n sequences: {config['sequences']}\n",
" "
))
sys.exit(1)

# Check for missing inputs.
if "inputs" not in config:
logger.error("ERROR: Your workflow does not define any input files to start with.")
logger.error("Update your configuration file (e.g., 'builds.yaml') to define at least one input dataset as follows and try running the workflow again:")
logger.error(textwrap.indent(
f"\ninputs:\n name: local-data\n metadata: data/example_metadata.tsv\n sequences: data/example_sequences.fasta.gz\n",
" "
))
sys.exit(1)

# Allow users to specify a list of active builds from the command line.
if config.get("active_builds"):
BUILD_NAMES = config["active_builds"].split(",")
@@ -93,7 +114,7 @@ wildcard_constraints:
# but not special strings used for Nextstrain builds.
build_name = r'(?:[_a-zA-Z-](?!(tip-frequencies)))+',
date = r"[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]",
origin = r"(_[a-zA-Z0-9-]+)?" # origin starts with an underscore _OR_ it's the empty string
origin = r"[a-zA-Z0-9-_]+"

localrules: download_metadata, download_sequences, clean

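The updated `origin` constraint above no longer admits an empty match, because the deprecated single-input mode (empty origin) is gone. A minimal sketch of the new behaviour, using plain `re` for illustration (Snakemake embeds the constraint in its own path-matching regex, so this is an approximation):

```python
import re

# Constraint from the Snakefile above: input names now appear directly in
# file paths, without the leading underscore the old pattern required.
origin = re.compile(r"[a-zA-Z0-9-_]+")

assert origin.fullmatch("worldwide")   # e.g. results/aligned_worldwide.fasta
assert origin.fullmatch("aus")
assert not origin.fullmatch("")        # the empty origin of the old single-input mode no longer matches
```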
8 changes: 3 additions & 5 deletions defaults/parameters.yaml
@@ -6,11 +6,6 @@
# This must be a relative path to the top-level Snakefile directory (e.g., `ncov/`).
conda_environment: "workflow/envs/nextstrain.yaml"

# These are the two main starting files for the run.
# If they do not exist, we will attempt to fetch them from a S3 bucket (see below)
sequences: "data/sequences.fasta"
metadata: "data/metadata.tsv"

strip_strain_prefixes:
- hCoV-19/
- SARS-CoV-2/
@@ -36,6 +31,9 @@ files:
clades: "defaults/clades.tsv"
emerging_lineages: "defaults/emerging_lineages.tsv"

# Define genes to translate during alignment by nextalign.
genes: ["ORF1a", "ORF1b", "S", "ORF3a", "M", "N"]

# Filter settings
filter:
# Require nearly full-length genomes.
15 changes: 15 additions & 0 deletions docs/change_log.md
@@ -3,6 +3,21 @@
As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.

## v5 (7 May 2021)

[See the corresponding pull request](https://github.com/nextstrain/ncov/pull/615) for more details about this release.

### Major changes

- Drop support for old sequence/metadata inputs
- Use nextalign for alignment instead of mafft

### Minor changes

- Drop unused haplotype status rule and script
- Remove unused nucleotide mutation frequencies rule
- Use augur distance for mutation counts

## v4 (5 May 2021)

[See the corresponding pull request](https://github.com/nextstrain/ncov/pull/605) for more details about changes in this release.
20 changes: 9 additions & 11 deletions docs/multiple_inputs.md
@@ -51,15 +51,17 @@ my_profiles/example_multiple_inputs/my_auspice_config.json

## Setting up the config

Typically, inside the `builds.yaml` one would specify input files such as
You can define a single input dataset in `builds.yaml` as follows.

```yaml
# traditional syntax for specifying starting files
sequences: "data/sequences.fasta"
metadata: "data/metadata.tsv"
inputs:
- name: my-data
metadata: "data/metadata.tsv"
sequences: "data/sequences.fasta"
```
For multiple inputs, we shall use the new `inputs` section of the config to specify that we have two different inputs, and we will give them the names "aus" and "worldwide":
For multiple inputs, add additional entries to the `inputs` list.
Here, we define two inputs named "aus" and "worldwide":

```yaml
# my_profiles/example_multiple_inputs/builds.yaml
@@ -72,15 +74,11 @@ inputs:
sequences: "data/example_sequences_worldwide.fasta"
```

> Note that if you also specify `sequences` or `metadata` as top level entries in the config, they will be ignored.

### Snakemake terminology

Inside the Snakemake rules, we use a wildcard `origin` to define different starting points.
For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="_worldwide"` and we expect that the config has defined
a sequences input via `config["sequences"]["worldwide"]=<path to fasta>` (note the leading `_` has been stripped from the `origin` in the config).
If we use the older syntax (specifying `sequences` or `metadata` as top level entries in the config) then `wildcards.origin=""`.

For instance, if we ask for the file `results/aligned_worldwide.fasta` then `wildcards.origin="worldwide"` and we expect that the config has defined
a sequences input as shown above.
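
To make the wildcard-to-config mapping concrete, here is a minimal sketch (illustrative only, not the workflow's code; it assumes the `inputs` list has already been keyed by `name`, which matches how the lookups in `common.smk` work, and the file paths are made up):

```python
# Hypothetical, pre-processed form of the builds.yaml above: one entry per input name.
config = {
    "inputs": {
        "aus": {
            "metadata": "data/example_metadata_aus.tsv",
            "sequences": "data/example_sequences_aus.fasta",
        },
        "worldwide": {
            "metadata": "data/example_metadata_worldwide.tsv",
            "sequences": "data/example_sequences_worldwide.fasta",
        },
    }
}

# Requesting results/aligned_worldwide.fasta sets wildcards.origin to "worldwide" ...
origin = "worldwide"

# ... and the rules resolve the starting files for that origin like this:
sequences = config["inputs"][origin]["sequences"]
metadata = config["inputs"][origin]["metadata"]
print(sequences)  # data/example_sequences_worldwide.fasta
```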

## How is metadata combined?

6 changes: 6 additions & 0 deletions my_profiles/example/builds.yaml
@@ -11,6 +11,12 @@

# In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme.

# Define input files.
inputs:
- name: example-data
metadata: data/example_metadata.tsv
sequences: data/example_sequences.fasta

builds:
# Focus on King County (location) in Washington State (division) in the USA (country)
# with a build name that will produce the following URL fragment on Nextstrain/auspice:
4 changes: 0 additions & 4 deletions my_profiles/example/config.yaml
@@ -10,10 +10,6 @@ configfile:
- defaults/parameters.yaml # Pull in the default values
- my_profiles/example/builds.yaml # Pull in our list of desired builds

config:
- sequences=data/example_sequences.fasta
- metadata=data/example_metadata.tsv

# Set the maximum number of cores you want Snakemake to use for this pipeline.
cores: 2

6 changes: 6 additions & 0 deletions my_profiles/getting_started/builds.yaml
@@ -8,6 +8,12 @@
# These subsample primarily from the area of interest ("focus"), and add in background ("contextual") sequences from the rest of the world.
# Contextual sequences that are genetically similar to (hamming distance) and geographically near the focal sequences are heavily prioritized.

# Define input files.
inputs:
- name: example-data
metadata: data/example_metadata.tsv
sequences: data/example_sequences.fasta.gz

# In this example, we use these default methods. See other templates for examples of how to customize this subsampling scheme.
builds:
# This build samples evenly from the globe
4 changes: 0 additions & 4 deletions my_profiles/getting_started/config.yaml
@@ -10,10 +10,6 @@ configfile:
- defaults/parameters.yaml # Pull in the default values
- my_profiles/getting_started/builds.yaml # Pull in our list of desired builds

config:
- sequences=data/example_sequences.fasta
- metadata=data/example_metadata.tsv

# Set the maximum number of cores you want Snakemake to use for this pipeline.
cores: 1

36 changes: 0 additions & 36 deletions scripts/annotate-haplotype-status.py

This file was deleted.

55 changes: 0 additions & 55 deletions scripts/mutation_counts.py

This file was deleted.

62 changes: 21 additions & 41 deletions workflow/snakemake_rules/common.smk
@@ -25,89 +25,69 @@ def numeric_date(dt=None):

return res

def _trim_origin(origin):
"""the origin wildcard includes a leading `_`. This function returns the value without this `_`"""
if origin=="":
return ""
return origin[1:]

def _get_subsampling_scheme_by_build_name(build_name):
return config["builds"][build_name].get("subsampling_scheme", build_name)

def _get_filter_value(wildcards, key):
default = config["filter"].get(key, "")
if wildcards["origin"] == "":
return default
return config["filter"].get(_trim_origin(wildcards["origin"]), {}).get(key, default)
return config["filter"].get(wildcards["origin"], {}).get(key, default)

def _get_path_for_input(stage, origin_wildcard):
"""
A function called to define an input for a Snakemake rule
This function always returns a local filepath, the format of which decides whether rules should
create this by downloading from a remote resource, or create it by a local compute rule.
"""
if not origin_wildcard:
# No origin wildcards => deprecated single inputs (e.g. `config["sequences"]`) which cannot
# be downloaded from remote resources
if config.get("inputs"):
raise Exception("ERROR: empty origin wildcard but config defines 'inputs`")
path_or_url = config[stage] if stage in ["metadata", "sequences"] else ""
remote = False
else:
trimmed_origin = _trim_origin(origin_wildcard)
path_or_url = config.get("inputs", {}).get(trimmed_origin, {}).get(stage, "")
scheme = urlsplit(path_or_url).scheme
remote = bool(scheme)
path_or_url = config.get("inputs", {}).get(origin_wildcard, {}).get(stage, "")
scheme = urlsplit(path_or_url).scheme
remote = bool(scheme)

# Following checking should be the remit of the rule which downloads the remote resource
if scheme and scheme!="s3":
raise Exception(f"Input defined scheme {scheme} which is not yet supported.")
# Following checking should be the remit of the rule which downloads the remote resource
if scheme and scheme!="s3":
raise Exception(f"Input defined scheme {scheme} which is not yet supported.")

## Basic checking which could be taken care of by the config schema
## If asking for metadata/sequences, the config _must_ supply a `path_or_url`
if path_or_url=="" and stage in ["metadata", "sequences"]:
raise Exception(f"ERROR: config->input->{trimmed_origin}->{stage} is not defined.")
## Basic checking which could be taken care of by the config schema
## If asking for metadata/sequences, the config _must_ supply a `path_or_url`
if path_or_url=="" and stage in ["metadata", "sequences"]:
raise Exception(f"ERROR: config->input->{origin_wildcard}->{stage} is not defined.")

if stage=="metadata":
return f"data/downloaded{origin_wildcard}.tsv" if remote else path_or_url
return f"data/downloaded_{origin_wildcard}.tsv" if remote else path_or_url
if stage=="sequences":
return f"data/downloaded{origin_wildcard}.fasta" if remote else path_or_url
return f"data/downloaded_{origin_wildcard}.fasta" if remote else path_or_url
if stage=="aligned":
return f"results/precomputed-aligned{origin_wildcard}.fasta" if remote else f"results/aligned{origin_wildcard}.fasta"
return f"results/precomputed-aligned_{origin_wildcard}.fasta" if remote else f"results/aligned_{origin_wildcard}.fasta"
if stage=="to-exclude":
return f"results/precomputed-to-exclude{origin_wildcard}.txt" if remote else f"results/to-exclude{origin_wildcard}.txt"
return f"results/precomputed-to-exclude_{origin_wildcard}.txt" if remote else f"results/to-exclude_{origin_wildcard}.txt"
if stage=="masked":
return f"results/precomputed-masked{origin_wildcard}.fasta" if remote else f"results/masked{origin_wildcard}.fasta"
return f"results/precomputed-masked_{origin_wildcard}.fasta" if remote else f"results/masked_{origin_wildcard}.fasta"
if stage=="filtered":
if remote:
return f"results/precomputed-filtered{origin_wildcard}.fasta"
return f"results/precomputed-filtered_{origin_wildcard}.fasta"
elif path_or_url:
return path_or_url
else:
return f"results/filtered{origin_wildcard}.fasta"
return f"results/filtered_{origin_wildcard}.fasta"

raise Exception(f"_get_path_for_input with unknown stage \"{stage}\"")


def _get_unified_metadata(wildcards):
"""
Returns a single metadata file representing the input metadata file(s).
If there was only one supplied metadata file (e.g. the deprecated
`config["metadata"]` syntax, or one entry in the `config["inputs"] dict`)
If there was only one supplied metadata file in the `config["inputs"] dict`,
then that file is returned. Else "results/combined_metadata.tsv" is returned
which will run the `combine_input_metadata` rule to make it.
"""
if not config.get("inputs"):
return config["metadata"]
if len(list(config["inputs"].keys()))==1:
return "results/sanitized_metadata{origin}.tsv".format(origin="_"+list(config["inputs"].keys())[0])
return "results/sanitized_metadata_{origin}.tsv".format(origin=list(config["inputs"].keys())[0])
return "results/combined_metadata.tsv"

def _get_unified_alignment(wildcards):
if not config.get("inputs"):
return "results/filtered.fasta"
if len(list(config["inputs"].keys()))==1:
return _get_path_for_input("filtered", "_"+list(config["inputs"].keys())[0])
return _get_path_for_input("filtered", list(config["inputs"].keys())[0])
return "results/combined_sequences_for_subsampling.fasta",

def _get_metadata_by_build_name(build_name):
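
As a rough illustration of the naming convention `_get_path_for_input` now follows, here is a sketch of the "sequences" branch with a made-up config (the real rules also handle the other stages and perform the actual S3 download):

```python
from urllib.parse import urlsplit

# Hypothetical inputs config: one local file and one remote (S3) file.
config = {
    "inputs": {
        "aus": {"sequences": "data/example_sequences_aus.fasta"},
        "worldwide": {"sequences": "s3://example-bucket/sequences.fasta.gz"},  # made-up bucket
    }
}

def sequences_path(origin):
    """Mirror the 'sequences' branch above: remote inputs resolve to a
    data/downloaded_{origin}.fasta target for a download rule; local inputs
    are used in place."""
    path_or_url = config["inputs"][origin].get("sequences", "")
    remote = bool(urlsplit(path_or_url).scheme)
    return f"data/downloaded_{origin}.fasta" if remote else path_or_url

print(sequences_path("aus"))        # data/example_sequences_aus.fasta
print(sequences_path("worldwide"))  # data/downloaded_worldwide.fasta
```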
