Merge pull request #842 from nextstrain/nextclade-align

Nextclade align
nextstrain · Feb 3, 2022 · c5699a6 · c5699a6
2 parents 59da03e + 46fa71e
commit c5699a6
Show file tree

Hide file tree

Showing 6 changed files with 204 additions and 13 deletions.
diff --git a/defaults/parameters.yaml b/defaults/parameters.yaml
@@ -37,8 +37,7 @@ sanitize_metadata:
     - "Lineage=pango_lineage"
     - "Pangolin version=pangolin_version"
     - "Variant=variant"
-    - "AA Substitutions=aa_substitutions"
-    - "aaSubstitutions=aa_substitutions"
+    - "AA Substitutions=aaSubstitutions"
     - "Submission date=date_submitted"
     - "Is reference?=is_reference"
     - "Is complete?=is_complete"

diff --git a/docs/src/reference/change_log.md b/docs/src/reference/change_log.md
@@ -3,6 +3,10 @@
 As of April 2021, we use major version numbers (e.g. v2) to reflect backward incompatible changes to the workflow that likely require you to update your Nextstrain installation.
 We also use this change log to document new features that maintain backward compatibility, indicating these features by the date they were added.
 
+## v11 (3 February 2022)
+
+- Run Nextclade QC and filtering on the final sample set before building a tree. Nextclade also runs `nextalign` under the hood. Importantly, this enables filtering the final sample set to omit strains with many reversions and/or possible contaminants, significantly improving the quality of Omicron trees. [See the original pull request for more details](https://github.com/nextstrain/ncov/pull/842). To disable this filtering by Nextclade quality control metrics, set `skip_diagnostics: true` in [the `filter` section of your build configuration file](https://docs.nextstrain.org/projects/ncov/en/latest/reference/configuration.html#filter).
+
 ## New features since last version update
 
 - 29 January 2022: Update "mutational fitness" coloring based on latest results from [Obermeyer et al model](https://www.medrxiv.org/content/10.1101/2021.09.07.21263228v1) via [github.com/broadinstitute/pyro-cov/](https://github.com/broadinstitute/pyro-cov/blob/master/paper/mutations.tsv).

diff --git a/docs/src/reference/configuration.md b/docs/src/reference/configuration.md
@@ -209,6 +209,11 @@ Builds support any named attributes that can be referenced by subsampling scheme
 * description: Minimum collection date for strains to include in the analysis used by `augur filter --min-date`. Dates can be numeric floating point values (e.g., `2019.74`) or ISO 8601-style strings (e.g., `2019-10-01`).
 * default: `2019.74`
 
+### skip_diagnostics
+* type: boolean
+* description: Skip filtering by Nextclade quality control metrics like clock rate deviation, number of SNP clusters, possible contaminations, etc.
+* default: `false`
+
 ## frequencies
 ### min_date
 * type: float or string

diff --git a/scripts/join-metadata-and-clades.py b/scripts/join-metadata-and-clades.py
@@ -0,0 +1,147 @@
+#!/usr/bin/env python3
+"""
+Copied from https://github.com/nextstrain/ncov-ingest/blob/master/bin/join-metadata-and-clades
+"""
+import argparse
+import sys
+from datetime import datetime
+import pandas as pd
+import numpy as np
+
+INSERT_BEFORE_THIS_COLUMN = "pango_lineage"
+METADATA_JOIN_COLUMN_NAME = 'strain'
+NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
+VALUE_MISSING_DATA = '?'
+
+rate_per_day = 0.0007 * 29903 / 365
+reference_day = datetime(2020,1,1).toordinal()
+
+column_map = {
+    "clade": "Nextstrain_clade",
+    "totalMissing": "missing_data",
+    "totalSubstitutions": "divergence",
+    "totalNonACGTNs": "nonACGTN",
+    "privateNucMutations.totalUnlabeledSubstitutions":  "rare_mutations",
+    "privateNucMutations.totalReversionSubstitutions": "reversion_mutations",
+    "privateNucMutations.totalLabeledSubstitutions": "potential_contaminants",
+    "qc.missingData.status": "QC_missing_data",
+    "qc.mixedSites.status": "QC_mixed_sites",
+    "qc.privateMutations.status": "QC_rare_mutations",
+    "qc.snpClusters.status": "QC_snp_clusters",
+    "qc.frameShifts.status": "QC_frame_shifts",
+    "qc.stopCodons.status": "QC_stop_codons",
+    "frameShifts": "frame_shifts",
+    "deletions": "deletions",
+    "insertions": "insertions",
+    "substitutions": "substitutions",
+    "aaSubstitutions": "aaSubstitutions"
+}
+
+preferred_types = {
+    "divergence": "int32",
+    "nonACGTN": "int32",
+    "missing_data": "int32",
+    "snp_clusters": "int32",
+    "rare_mutations": "int32"
+}
+
+def reorder_columns(result: pd.DataFrame):
+    """
+    Moves the new clade column after a specified column
+    """
+    columns = list(result.columns)
+    columns.remove(column_map['clade'])
+    insert_at = columns.index(INSERT_BEFORE_THIS_COLUMN)
+    columns.insert(insert_at, column_map['clade'])
+    return result[columns]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="Joins metadata file with Nextclade clade output",
+    )
+    parser.add_argument("first_file")
+    parser.add_argument("second_file")
+    parser.add_argument("-o", default=sys.stdout)
+    return parser.parse_args()
+
+def datestr_to_ordinal(x):
+    try:
+        return datetime.strptime(x,"%Y-%m-%d").toordinal()
+    except:
+        return np.nan
+
+def isfloat(value):
+  try:
+    float(value)
+    return True
+  except ValueError:
+    return False
+
+def main():
+    args = parse_args()
+
+    metadata = pd.read_csv(args.first_file, index_col=METADATA_JOIN_COLUMN_NAME,
+                           sep='\t', low_memory=False)
+
+    # Check for existing annotations in the given metadata. Skip join with
+    # Nextclade QC file, if those annotations already exist and none of the
+    # columns have empty values. In the case where metadata were combined from
+    # different sources with and without annotations, the "clock_deviation"
+    # column will exist but some values will be missing. We handle this case as
+    # if the annotations do not exist at all and reannotate all columns. We
+    # cannot look for missing values across all expected columns as evidence of
+    # incomplete annotations, since a complete annotation by Nextclade will
+    # include missing values for some columns by design.
+    expected_columns = list(column_map.values()) + ["clock_deviation"]
+    existing_annotation_columns = metadata.columns.intersection(expected_columns)
+    if len(existing_annotation_columns) == len(expected_columns):
+        if metadata["clock_deviation"].isnull().sum() == 0:
+            print(f"Metadata file '{args.first_file}' has already been annotated with Nextclade QC columns. Skipping re-annotation.")
+            metadata.to_csv(args.o, sep="\t")
+            return
+
+    # Read and rename clade column to be more descriptive
+    clades = pd.read_csv(args.second_file, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
+                         sep='\t', low_memory=False, na_filter = False) \
+            .rename(columns=column_map)
+
+    clade_columns = clades.columns.intersection(list(column_map.values()))
+    clades = clades[clade_columns]
+
+    # Concatenate on columns
+    result = pd.merge(
+        metadata, clades,
+        left_index=True,
+        right_index=True,
+        how='left',
+        suffixes=["_original", ""],
+    )
+    all_clades = result.Nextstrain_clade.unique()
+    t = result["date"].apply(datestr_to_ordinal)
+    div_array = np.array([float(x) if isfloat(x) else np.nan for x in result.divergence])
+    offset_by_clade = {}
+    for clade in all_clades:
+        ind = result.Nextstrain_clade==clade
+        if ind.sum()>100:
+            deviation = div_array[ind] - (t[ind] - reference_day)*rate_per_day
+            offset_by_clade[clade] = np.mean(deviation[~np.isnan(deviation)])
+
+    # extract divergence, time and offset information into vectors or series
+    offset = result["Nextstrain_clade"].apply(lambda x: offset_by_clade.get(x, 2.0))
+    # calculate divergence
+    result["clock_deviation"] = np.array(div_array - ((t-reference_day)*rate_per_day + offset), dtype=int)
+    result.loc[np.isnan(div_array)|np.isnan(t), "clock_deviation"] = np.nan
+
+    for col in list(column_map.values()) + ["clock_deviation"]:
+        result[col] = result[col].fillna(VALUE_MISSING_DATA)
+
+    # Move the new column so that it's next to other clade columns
+    if INSERT_BEFORE_THIS_COLUMN in result.columns:
+        result = reorder_columns(result) #.astype(preferred_types)
+
+    result.to_csv(args.o, index_label=METADATA_JOIN_COLUMN_NAME, sep='\t')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/workflow/envs/nextstrain.yaml b/workflow/envs/nextstrain.yaml
@@ -8,6 +8,7 @@ dependencies:
   - epiweeks=2.1.2
   - iqtree=2.1.4-beta
   - nextalign=1.9.0
+  - nextclade=1.10.1
   - pangolin=3.1.17
   - pangolearn=2021.12.06
   - python>=3.7*
diff --git a/workflow/snakemake_rules/main_workflow.smk b/workflow/snakemake_rules/main_workflow.smk
@@ -444,19 +444,35 @@ rule combine_samples:
             --output-metadata {output.metadata} 2>&1 | tee {log}
         """
 
+rule prepare_nextclade:
+    message:
+        """
+        Downloading reference files for nextclade (used for alignment and qc).
+        """
+    output:
+        nextclade_dataset = directory("data/sars-cov-2-nextclade-defaults"),
+    params:
+        name = "sars-cov-2",
+    shell:
+        """
+        nextclade dataset get --name {params.name} --output-dir {output.nextclade_dataset}
+        """
+
 rule build_align:
     message:
         """
-        Aligning sequences to {input.reference}
+        Running nextclade QC and aligning sequences to {input.reference}
             - gaps relative to reference are considered real
         """
     input:
         sequences = rules.combine_samples.output.sequences,
         genemap = config["files"]["annotation"],
-        reference = config["files"]["alignment_reference"]
+        reference = config["files"]["alignment_reference"],
+        nextclade_dataset = rules.prepare_nextclade.output.nextclade_dataset,
     output:
         alignment = "results/{build_name}/aligned.fasta",
         insertions = "results/{build_name}/insertions.tsv",
+        nextclade_qc = 'results/{build_name}/nextclade_qc.tsv',
         translations = expand("results/{{build_name}}/translations/aligned.gene.{gene}.fasta", gene=config.get('genes', ['S']))
     params:
         outdir = "results/{build_name}/translations",
@@ -472,22 +488,41 @@ rule build_align:
         mem_mb=3000
     shell:
         """
-        xz -c -d {input.sequences} | nextalign \
-            --jobs={threads} \
+        xz -c -d {input.sequences} |  nextclade run \
+            --jobs {threads} \
+            --input-fasta /dev/stdin \
             --reference {input.reference} \
-            --genemap {input.genemap} \
-            --genes {params.genes} \
-            --sequences /dev/stdin \
+            --input-dataset {input.nextclade_dataset} \
+            --output-tsv {output.nextclade_qc} \
             --output-dir {params.outdir} \
             --output-basename {params.basename} \
             --output-fasta {output.alignment} \
-            --output-insertions {output.insertions} > {log} 2>&1
+            --output-insertions {output.insertions} 2>&1 | tee {log}
+        """
+
+rule join_metadata_and_nextclade_qc:
+    input:
+        metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        nextclade_qc = "results/{build_name}/nextclade_qc.tsv",
+    output:
+        metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
+    log:
+        "logs/join_metadata_and_nextclade_qc_{build_name}.txt",
+    benchmark:
+        "benchmarks/join_metadata_and_nextclade_qc_{build_name}.txt",
+    conda: config["conda_environment"]
+    shell:
+        """
+        python3 scripts/join-metadata-and-clades.py \
+            {input.metadata} \
+            {input.nextclade_qc} \
+            -o {output.metadata} 2>&1 | tee {log}
         """
 
 rule diagnostic:
     message: "Scanning metadata {input.metadata} for problematic sequences. Removing sequences with >{params.clock_filter} deviation from the clock and with more than {params.snp_clusters}."
     input:
-        metadata = "results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        metadata = "results/{build_name}/metadata_with_nextclade_qc.tsv",
     output:
         to_exclude = "results/{build_name}/excluded_by_diagnostics.txt"
     params:
@@ -596,7 +631,7 @@ rule index:
 
 rule annotate_metadata_with_index:
     input:
-        metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        metadata="results/{build_name}/metadata_with_nextclade_qc.tsv",
         sequence_index = "results/{build_name}/sequence_index.tsv",
     output:
         metadata="results/{build_name}/metadata_with_index.tsv",
@@ -716,7 +751,7 @@ rule adjust_metadata_regions:
         Adjusting metadata for build '{wildcards.build_name}'
         """
     input:
-        metadata="results/{build_name}/{build_name}_subsampled_metadata.tsv.xz",
+        metadata="results/{build_name}/metadata_with_index.tsv",
     output:
         metadata = "results/{build_name}/metadata_adjusted.tsv.xz"
     params: