Merge pull request #605 from nextstrain/support-gisaid-downloads
Support GISAID downloads
huddlej committed May 5, 2021
2 parents 7a71f49 + e917368 commit fb8caeb
Showing 19 changed files with 727 additions and 99 deletions.
10 changes: 6 additions & 4 deletions Snakefile
@@ -62,11 +62,13 @@ if len(overlapping_schemes) > 0:
logger.warning("")
time.sleep(5)

# default build if none specified in config
# Assign a default build if none are specified in the config. Users can define a
# `default_build_name` in their builds config without assigning any other build
# information. Otherwise, we use a generic name for the default build.
if "builds" not in config:
config["builds"] = {
"global": {
"subsampling_scheme": "region_global",
config.get("default_build_name", "default-build"): {
"subsampling_scheme": "all",
}
}

@@ -93,7 +95,7 @@ wildcard_constraints:
date = r"[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]",
origin = r"(_[a-zA-Z0-9-]+)?" # origin starts with an underscore _OR_ it's the empty string

localrules: download_metadata, download_sequences, download, upload, clean
localrules: download_metadata, download_sequences, clean

# Create a standard ncov build for auspice, by default.
rule all:
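The Snakefile change above replaces the hard-coded "global" default build with a name resolved from the config. A minimal standalone sketch of that resolution logic (mirroring, not copying, the Snakefile):

```python
# Sketch of the new default-build logic: if no builds are configured, create
# one whose name comes from `default_build_name` (falling back to
# "default-build") and whose subsampling scheme selects all sequences.
config = {}  # imagine a user config with no "builds" section

if "builds" not in config:
    config["builds"] = {
        config.get("default_build_name", "default-build"): {
            "subsampling_scheme": "all",
        }
    }

print(config["builds"])  # {'default-build': {'subsampling_scheme': 'all'}}
```

A user who sets `default_build_name: my-build` in their config gets `my-build` as the key instead, without defining any other build details.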
Binary file added data/example_metadata_aus.tsv.xz
Binary file added data/example_metadata_worldwide.tsv.xz
Binary file added data/example_sequences_aus.fasta.xz
Binary file added data/example_sequences_worldwide.fasta.xz
2 changes: 2 additions & 0 deletions data/references_metadata.tsv
@@ -0,0 +1,2 @@
strain virus gisaid_epi_isl genbank_accession date region country division location region_exposure country_exposure division_exposure segment length host age sex originating_lab submitting_lab authors url title date_submitted
Wuhan/WH01/2019 ncov EPI_ISL_406798 LR757998 2019-12-26 Asia China Hubei Wuhan Asia China Hubei genome 29866 Human 44 Male General Hospital of Central Theater Command of People's Liberation Army of China BGI & Institute of Microbiology, Chinese Academy of Sciences & Shandong First Medical University & Shandong Academy of Medical Sciences & General Hospital of Central Theater Command of People's Liberation Army of China Weijun Chen et al https://www.gisaid.org ? 2020-01-30
499 changes: 499 additions & 0 deletions data/references_sequences.fasta

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion defaults/include.txt
@@ -1,3 +1,3 @@
Wuhan/Hu-1/2019
Wuhan-Hu-1/2019
Wuhan/WH01/2019
Wuhan/WH01/2019
18 changes: 17 additions & 1 deletion defaults/parameters.yaml
@@ -11,6 +11,10 @@ conda_environment: "workflow/envs/nextstrain.yaml"
sequences: "data/sequences.fasta"
metadata: "data/metadata.tsv"

strip_strain_prefixes:
- hCoV-19/
- SARS-CoV-2/

reference_node_name: "USA/WA1/2020"

# Define files used for external configuration. Common examples consist of a
@@ -62,12 +66,19 @@ mask:
# 13402, 24389 and 24390 are restricted to Belgian samples
mask_sites: "13402 24389 24390"

# Depending on how you define focal and contextual inputs, the same strains may
# appear in both inputs. This option tells the workflow not to throw an error
# when duplicates occur but to accept the first instance of a given strain's
# sequences (e.g., the focal set).
combine_sequences_for_subsampling:
warn_about_duplicates: true

tree:
tree-builder-args: "'-ninit 10 -n 4'"

# TreeTime settings
refine:
root: "Wuhan/Hu-1/2019 Wuhan/WH01/2019" #EPI_ISL_402125 EPI_ISL_406798
root: "Wuhan/WH01/2019"
clock_rate: 0.0008
clock_std_dev: 0.0004
coalescent: "skyline"
@@ -141,6 +152,11 @@ exposure:
# attribute whose value is the name of an entry in the subsampling configuration
# below or defined elsewhere in another configuration file.
subsampling:
# Default subsampling logic to select all strains from all inputs (i.e., no subsampling).
all:
all:
no_subsampling: true

# Default subsampling logic for regions
region:
# Focal samples for region
16 changes: 16 additions & 0 deletions docs/change_log.md
@@ -3,6 +3,22 @@
As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.

## v4 (5 May 2021)

[See the corresponding pull request](https://github.com/nextstrain/ncov/pull/605) for more details about changes in this release.

### Major changes

- Change the default build name from "global" to "default-build" and use a default subsampling scheme that selects all input sequences
- Warn about duplicate sequences found when merging sequences from multiple inputs instead of throwing an error (set `combine_sequences_for_subsampling: warn_about_duplicates: false` in your configuration file to revert this behavior)

### Features

- Define a new subsampling scheme named `all` that selects all input sequences
- Add a new top-level configuration parameter, `default_build_name`, to allow overriding the new default build name of "default-build"
- Support compressed sequence inputs for alignment with mafft and nextalign (requires mafft upgrade)
- Sanitize strain names in sequences and metadata from different sources (e.g., `hCoV-19/` from GISAID or `SARS-CoV-2/` from GenBank, etc.)

## New features since last version update

- 20 April 2021: Surface emerging lineage as a colorby. This replaces the rather stale color by "Emerging Clade" with a new color by "Emerging Lineage". This focuses on PANGO lineages of interest, triangulated from [CoVariants](https://covariants.org/), [PANGO](https://cov-lineages.org/) international lineage reports, and the VUIs and VOCs tracked by the [CDC](https://www.cdc.gov/coronavirus/2019-ncov/cases-updates/variant-surveillance/variant-info.html) and [PHE](https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/975742/Variants_of_Concern_VOC_Technical_Briefing_8_England.pdf). The intention is for the listing in `emerging_lineages.tsv` to be updated frequently, with new lineages added and no-longer-interesting lineages dropped. [#609](https://github.com/nextstrain/ncov/pull/609)
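The strain-name sanitizing feature listed in this change log works by anchoring an alternation of known prefixes at the start of each name. A minimal sketch using the default prefixes from `defaults/parameters.yaml` (the diff's scripts join the prefixes directly; escaping them here is a defensive addition, not the scripts' exact code):

```python
import re

# Default prefixes stripped by the workflow (see defaults/parameters.yaml).
PREFIXES = ["hCoV-19/", "SARS-CoV-2/"]

def strip_strain_prefixes(strain, prefixes=PREFIXES):
    """Remove a single leading database prefix from a strain name."""
    pattern = f"^({'|'.join(re.escape(p) for p in prefixes)})"
    return re.sub(pattern, "", strain)

print(strip_strain_prefixes("hCoV-19/Wuhan/WH01/2019"))  # Wuhan/WH01/2019
```

Names without a recognized prefix pass through unchanged, so the same pass can run safely over mixed GISAID and GenBank inputs.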
83 changes: 37 additions & 46 deletions my_profiles/example_global_context/builds.yaml
@@ -1,56 +1,47 @@

## This YAML file is sparsely commented, with a focus on the parts relevant to multiple inputs
## See my_profiles/example/builds.yaml for more general comments
## See docs/multiple_inputs.md for a walkthrough of this config.

# custom_rules:
# - my_profiles/example_multiple_inputs/rules.smk

use_nextalign: true

# Define an ordered list of input datasets including a "focal" set (e.g., a
# custom selection of strains from GISAID's "custom selection" or search
# interfaces) and a "contextual" set (e.g., a curated selection of strains
# representing global or regional circulation at a specific point in time from
# GISAID's "nextregion" downloads).
inputs:
- name: local
metadata: "data/example_metadata.tsv"
sequences: "data/example_sequences.fasta"
- name: global
# Define local paths to a pre-filtered global context.
metadata: "data/global_subsampled_metadata.tsv.gz"
filtered: "data/global_subsampled_sequences.fasta.gz"
# Paths to files on S3 also work. For example:
#metadata: "s3://nextstrain-ncov-private/global_subsampled_metadata.tsv.xz"
#filtered: "s3://nextstrain-ncov-private/global_subsampled_sequences.fasta.xz"
# TODO: ideally, we could support something like the following, to inject
# a collection of contextual sequences into the analysis after all other
# subsampling.
# subsampled: "data/global_subsampled_sequences.fasta.gz"

- name: focal
metadata: "data/example_metadata_aus.tsv.xz"
sequences: "data/example_sequences_aus.fasta.xz"
- name: contextual
metadata: "data/example_metadata_worldwide.tsv.xz"
filtered: "data/example_sequences_worldwide.fasta.xz"

# Define a single build named "global".
builds:
global-context:
subsampling_scheme: custom-scheme # use a custom subsampling scheme defined below
global:
# Use a custom subsampling scheme defined below
subsampling_scheme: custom-scheme

# STAGE 1: Input-specific filtering parameters
# Align sequences with nextalign instead of mafft.
use_nextalign: true

# Input-specific filtering parameters.
filter:
aus:
min_length: 5000 # Allow shorter genomes. Parameter used to filter alignment.
skip_diagnostics: True # skip diagnostics (which can remove genomes) for this input
focal:
# Allow shorter genomes. Parameter used to filter alignment.
min_length: 5000

# Skip diagnostics (which can remove genomes) for this input.
skip_diagnostics: True

# STAGE 2: Subsampling parameters
# Subsampling parameters
subsampling:
custom-scheme:
# Use metadata key to include ALL from `input1`
australian_focus:
exclude: "--exclude-where 'aus!=yes'" # subset to sequences from input `aus`
# Include all global contextual sequences without subsampling.
global_context:
exclude: "--exclude-where 'aus=yes'" # i.e. subset to sequences _not_ from input `aus`

files:
auspice_config: "my_profiles/example_global_context/my_auspice_config.json"
description: "my_profiles/example_global_context/my_description.md"
# Subsample from the focal sequences. Remove or comment out the
# `max_sequences` line to select all focal sequences.
focal:
max_sequences: 100
query: --query "focal == 'yes'"
# Subsample from the contextual sequences. Remove or comment out the
# `max_sequences` line to select all contextual sequences.
contextual:
max_sequences: 100
query: --query "contextual == 'yes'"

skip_travel_history_adjustment: True

traits:
global-context:
sampling_bias_correction: 2.5
columns: ["country"]
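The `--query "focal == 'yes'"` lines in the subsampling scheme above select strains by a metadata column that marks each record's input of origin. A sketch of that selection with pandas (assumption: the workflow tags each record with a per-input column such as `focal`/`contextual`; the strain names here are illustrative):

```python
import pandas as pd

# Toy metadata where each strain is flagged by the input it came from.
metadata = pd.DataFrame({
    "strain": ["A/2020", "B/2020", "C/2020"],
    "focal": ["yes", "no", "yes"],
})

# The augur filter `--query` expression maps onto DataFrame.query directly.
selected = metadata.query("focal == 'yes'")
print(list(selected["strain"]))  # ['A/2020', 'C/2020']
```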
20 changes: 18 additions & 2 deletions scripts/combine-and-dedup-fastas.py
@@ -13,6 +13,7 @@
)

parser.add_argument('--input', type=str, nargs="+", metavar="FASTA", required=True, help="input FASTAs")
parser.add_argument('--warn-about-duplicates', action="store_true", help="warn the user about duplicate strains instead of exiting with an error. The output will include the first occurrence of a duplicate sequence.")
parser.add_argument('--output', type=str, metavar="FASTA", required=True, help="output FASTA")
args = parser.parse_args()

@@ -48,11 +49,26 @@
SeqIO.write(record, output_handle, 'fasta')

if len(duplicate_strains) > 0:
error_mode = "ERROR"
exit_code = 1

if args.warn_about_duplicates:
error_mode = "WARNING"
exit_code = 0

print(
"ERROR: Detected the following duplicate input strains with different sequences:",
f"{error_mode}: Detected the following duplicate input strains with different sequences:",
file=sys.stderr
)
for strain in duplicate_strains:
print(textwrap.indent(strain, " "), file=sys.stderr)

sys.exit(1)
if not args.warn_about_duplicates:
print(
"Use the `--warn-about-duplicates` flag, if you prefer to accept",
"the first occurrence of duplicate sequences based on the order",
"of the given input sequences.",
file=sys.stderr
)

sys.exit(exit_code)
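With `--warn-about-duplicates`, the script above keeps the first occurrence of each duplicated strain rather than exiting. A simplified sketch of that keep-first pass (illustrative names, not the script's internals):

```python
def dedup_keep_first(records):
    """Yield (name, sequence) pairs, keeping only the first occurrence
    of each strain name and silently dropping later duplicates."""
    seen = set()
    for name, sequence in records:
        if name in seen:
            continue  # later duplicate; the real script also reports these
        seen.add(name)
        yield (name, sequence)

# Duplicate "focal/1" appears with a different sequence; the first wins.
records = [("focal/1", "ACGT"), ("context/1", "GGTA"), ("focal/1", "ACGA")]
kept = list(dedup_keep_first(records))
print([name for name, _ in kept])  # ['focal/1', 'context/1']
```

Because input order decides which copy survives, listing the focal input first in the config makes its sequences take precedence over contextual copies of the same strain.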
2 changes: 1 addition & 1 deletion scripts/construct-recency-from-submission-date.py
@@ -37,7 +37,7 @@ def get_recency(date_str, ref_date):
ref_date = datetime.now()

for strain, d in meta.items():
if 'date_submitted' in d and d['date_submitted']:
if 'date_submitted' in d and d['date_submitted'] and d['date_submitted'] != "undefined":
node_data['nodes'][strain] = {'recency': get_recency(d['date_submitted'], ref_date)}

with open(args.output, 'wt') as fh:
34 changes: 34 additions & 0 deletions scripts/sanitize_metadata.py
@@ -0,0 +1,34 @@
import argparse
from augur.utils import read_metadata
import pandas as pd
import re


if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--metadata", required=True, help="metadata to be sanitized")
parser.add_argument("--strip-prefixes", nargs="+", help="prefixes to strip from strain names in the metadata")
parser.add_argument("--output", required=True, help="sanitized metadata")

args = parser.parse_args()

metadata, columns = read_metadata(args.metadata)
metadata = pd.DataFrame.from_dict(metadata, orient="index")

if args.strip_prefixes:
prefixes = "|".join(args.strip_prefixes)
pattern = f"^({prefixes})"

metadata["strain"] = metadata["strain"].apply(
lambda strain: re.sub(
pattern,
"",
strain
)
)

metadata.to_csv(
args.output,
sep="\t",
index=False
)
23 changes: 23 additions & 0 deletions scripts/sanitize_sequences.py
@@ -0,0 +1,23 @@
import argparse
from augur.io import open_file, read_sequences, write_sequences
import re


if __name__ == '__main__':
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("--sequences", nargs="+", required=True, help="sequences to be sanitized")
parser.add_argument("--strip-prefixes", nargs="+", help="prefixes to strip from strain names in the sequences")
parser.add_argument("--output", required=True, help="sanitized sequences")

args = parser.parse_args()

if args.strip_prefixes:
prefixes = "|".join(args.strip_prefixes)
pattern = f"^({prefixes})"
else:
pattern = ""

with open_file(args.output, "w") as output_handle:
for sequence in read_sequences(*args.sequences):
sequence.id = re.sub(pattern, "", sequence.id)
write_sequences(sequence, output_handle)
24 changes: 5 additions & 19 deletions workflow/envs/nextstrain.yaml
@@ -4,24 +4,10 @@ channels:
- bioconda
- defaults
dependencies:
- cvxopt
- datrie
- fasttree
- iqtree
- mafft=7.471
- nextalign=0.1.6
- pandas
- augur=12.0.0
- iqtree=2.1.2
- mafft=7.475
- nextalign=0.2.0
- pangolin=2.3.8
- pangolearn=2021.04.01
- psutil
- python=3.7*
- nodejs=10
- raxml
- vcftools
- git
- pip
- pip:
- awscli==1.18.45
- nextstrain-augur==11.3.0
- nextstrain-cli==1.16.5
- rethinkdb==2.3.0.post6
- python>=3.7*
2 changes: 1 addition & 1 deletion workflow/snakemake_rules/common.smk
@@ -100,7 +100,7 @@ def _get_unified_metadata(wildcards):
if not config.get("inputs"):
return config["metadata"]
if len(list(config["inputs"].keys()))==1:
return _get_path_for_input("metadata", "_"+list(config["inputs"].keys())[0])
return "results/sanitized_metadata{origin}.tsv".format(origin="_"+list(config["inputs"].keys())[0])
return "results/combined_metadata.tsv"

def _get_unified_alignment(wildcards):
5 changes: 3 additions & 2 deletions workflow/snakemake_rules/download.smk
@@ -49,10 +49,11 @@ rule download_metadata:
metadata = "data/downloaded{origin}.tsv"
conda: config["conda_environment"]
params:
address = lambda w: config["inputs"][_trim_origin(w.origin)]["metadata"]
address = lambda w: config["inputs"][_trim_origin(w.origin)]["metadata"],
deflate = lambda w: _infer_decompression(config["inputs"][_trim_origin(w.origin)]["metadata"]),
shell:
"""
aws s3 cp {params.address} - | gunzip -cq >{output.metadata:q}
aws s3 cp {params.address} - | {params.deflate} > {output.metadata:q}
"""

rule download_aligned:
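The download rule now pipes through a decompression command chosen by `_infer_decompression`, whose implementation is not shown in this diff. A plausible sketch of such a helper, mapping the input path's extension to a shell command (an assumption, not the workflow's actual code):

```python
def infer_decompression(path):
    """Guess the shell command that decompresses `path` to stdout.

    Hypothetical stand-in for the `_infer_decompression` helper referenced
    in download.smk; `cat` passes uncompressed files through unchanged.
    """
    if path.endswith(".xz"):
        return "xz -dcq"
    if path.endswith(".gz"):
        return "gunzip -cq"
    return "cat"

print(infer_decompression("s3://bucket/metadata.tsv.xz"))  # xz -dcq
```

This is what lets the same `aws s3 cp {params.address} - | {params.deflate}` pipeline handle `.xz`, `.gz`, and plain inputs, where the old rule hard-coded `gunzip -cq`.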