Merge pull request #1075 from nextstrain/100k-open
Add 100k open samples
jameshadfield committed Jun 30, 2023
2 parents 53d1b8e + ffd7e8e commit 0f2c298
Showing 5 changed files with 96 additions and 16 deletions.
51 changes: 42 additions & 9 deletions .github/workflows/rebuild-100k.yml
@@ -17,7 +17,7 @@ jobs:
       with:
         python-version: "3.10"
 
-    - name: Launch build
+    - name: Launch GISAID build
       run: |
         set -x
@@ -31,27 +31,60 @@ jobs:
           --memory 31GiB \
           . \
           upload \
-          --configfile nextstrain_profiles/100k/config.yaml \
+          --configfile nextstrain_profiles/100k/config-gisaid.yaml \
           --config "${config[@]}" \
           --set-threads tree=8 \
-          |& tee build-launch.log
+          |& tee build-launch-gisaid.log
       env:
         AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
         AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
         SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
 
+    - name: Launch open build
+      run: |
+        set -x
+        declare -a config
+        config+=(slack_token=$SLACK_TOKEN)
+        nextstrain build \
+          --aws-batch \
+          --detach \
+          --cpus 16 \
+          --memory 31GiB \
+          . \
+          upload \
+          --configfile nextstrain_profiles/100k/config-open.yaml \
+          --config "${config[@]}" \
+          --set-threads tree=8 \
+          |& tee build-launch-open.log
+      env:
+        AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
+        AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
+        SLACK_TOKEN: ${{ secrets.SLACK_TOKEN }}
+
+
     - name: Build info
       run: |
-        echo "--> 100k sample rebuilding on AWS"
+        echo "--> 100k samples for GISAID + Open data rebuilding (using separate AWS jobs)"
         echo
-        echo "--> When completed, the following 2 files will be updated:"
+        echo "--> When completed, the following files will be updated:"
+        echo "s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz"
+        echo "s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz"
         echo "s3://nextstrain-ncov-private/100k/metadata.tsv.xz"
         echo "s3://nextstrain-ncov-private/100k/sequences.fasta.xz"
         echo
-        echo "--> You can attach to this AWS job via:"
-        tail -n1 build-launch.log
+        echo "--> You can attach to the GISAID AWS job via:"
+        tail -n1 build-launch-gisaid.log
         echo
+        echo "--> You can attach to the Open AWS job via:"
+        tail -n1 build-launch-open.log
+        echo
+        JOBID=$( tail -n1 build-launch-gisaid.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
+        echo "--> View the GISAID job in the AWS console via"
+        echo "    https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}"
+        echo
-        JOBID=$( tail -n1 build-launch.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
-        echo "--> View this job in the AWS console via"
+        JOBID=$( tail -n1 build-launch-open.log | sed -E 's/.+attach ([-a-f0-9]+).+/\1/' )
+        echo "--> View the Open job in the AWS console via"
         echo "    https://console.aws.amazon.com/batch/home?region=us-east-1#jobs/detail/${JOBID}"
         echo
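
As context for the `tail`/`sed` lines above: `nextstrain build --detach` prints a ready-to-run re-attach command as the last line of its output, which the workflow tees into the `build-launch-*.log` files. A minimal sketch of re-attaching later, assuming that log format:

```
# Stream status and logs from the still-running GISAID job; the last log line
# is itself a complete "nextstrain build ... attach <job-id> ..." command,
# so it can simply be evaluated as-is.
eval "$( tail -n1 build-launch-gisaid.log )"
```
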
11 changes: 11 additions & 0 deletions docs/src/reference/remote_inputs.rst
@@ -45,6 +45,13 @@ Our GISAID and open profiles each define 7 builds (a Global build and one build
 - ``{build_name}/{build_name}_tip-frequencies.json``
 - ``{build_name}/{build_name}_root-sequence.json``
 
+100k Subsamples
+---------------
+
+We also produce a subsample of the entire open dataset, containing around 100,000 samples.
+This is particularly useful for development or for running builds locally, as the file sizes are typically around 10MB (metadata) and 20MB (sequences).
+The data are chosen by sampling 50,000 sequences from the previous 12 months and 50,000 from before that; within each set we group by year, month and country to sample as evenly as possible.
+
 --------------
 
 .. _remote-inputs-open-files:
@@ -71,6 +78,10 @@ Each regional build (``global``, ``africa``, ``asia``, ``europe``, ``north-ameri
 +-----------------------+-----------------------+------------------------------------------------------------------------------+
 |                       | aligned (xz)          | https://data.nextstrain.org/files/ncov/open/aligned.fasta.xz                |
 +-----------------------+-----------------------+------------------------------------------------------------------------------+
+| 100k sample           | metadata              | https://data.nextstrain.org/files/ncov/open/100k/metadata.tsv.xz            |
++-----------------------+-----------------------+------------------------------------------------------------------------------+
+|                       | sequences             | https://data.nextstrain.org/files/ncov/open/100k/sequences.fasta.xz         |
++-----------------------+-----------------------+------------------------------------------------------------------------------+
 | Global sample         | metadata              | https://data.nextstrain.org/files/ncov/open/global/metadata.tsv.xz          |
 +-----------------------+-----------------------+------------------------------------------------------------------------------+
 |                       | sequences             | https://data.nextstrain.org/files/ncov/open/global/sequences.fasta.xz       |
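
The 100k files documented above are small enough for local development; a hypothetical build config (not part of this commit, and the input name here is illustrative) could pull them in through the workflow's remote-inputs support:

```
inputs:
  - name: open-100k  # hypothetical input name
    metadata: https://data.nextstrain.org/files/ncov/open/100k/metadata.tsv.xz
    sequences: https://data.nextstrain.org/files/ncov/open/100k/sequences.fasta.xz
```
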
16 changes: 11 additions & 5 deletions nextstrain_profiles/100k/README.md
@@ -1,21 +1,27 @@
 ## Aim
 
 To build a representative 100k dataset which is available for testing / developing builds locally.
-This is intended to run weekly via a GitHub action (which triggers a job to be run on AWS).
-It will make two files available:
+This is intended to run weekly via a GitHub action (which triggers jobs to be run on AWS).
+It will upload these files:
 
+* `s3://nextstrain-data/files/ncov/open/100k/metadata.tsv.xz`
+* `s3://nextstrain-data/files/ncov/open/100k/sequences.fasta.xz`
 * `s3://nextstrain-ncov-private/100k/metadata.tsv.xz`
 * `s3://nextstrain-ncov-private/100k/sequences.fasta.xz`
 
 While this profile is not recommended to be run locally, you can see what rules would be run via:
 
 ```
-snakemake --cores 1 --configfile nextstrain_profiles/100k/config.yaml -npf upload --dag | dot -Tpdf > dag.pdf
+snakemake --cores 1 --configfile nextstrain_profiles/100k/config-gisaid.yaml -npf upload --dag | dot -Tpdf > dag-100k-gisaid.pdf
+snakemake --cores 1 --configfile nextstrain_profiles/100k/config-open.yaml -npf upload --dag | dot -Tpdf > dag-100k-open.pdf
 ```
 
-To run manually you can trigger the GitHub action or run the job locally via:
+To run manually you can trigger the GitHub action (recommended) or run the jobs locally via:
 ```
 nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \
-    --configfile nextstrain_profiles/100k/config.yaml \
+    --configfile nextstrain_profiles/100k/config-gisaid.yaml \
     -f upload
+nextstrain build --aws-batch --cpus 16 --memory 31GiB --detach . \
+    --configfile nextstrain_profiles/100k/config-open.yaml \
+    -f upload
 ```
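
Once the detached jobs finish, the uploads can be spot-checked directly; a sketch (the private bucket additionally requires credentialed AWS access):

```
# Public (open) subsample: peek at the metadata header over HTTPS
curl -fsSL https://data.nextstrain.org/files/ncov/open/100k/metadata.tsv.xz | xz -dc | head -n 1

# Private (GISAID) subsample: list the uploaded objects
aws s3 ls s3://nextstrain-ncov-private/100k/
```
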
4 changes: 2 additions & 2 deletions nextstrain_profiles/100k/{config.yaml → config-gisaid.yaml}
@@ -9,8 +9,8 @@ custom_rules:
 # Note: unaligned sequences are provided as "aligned" sequences to avoid an initial full-DB alignment
 inputs:
   - name: gisaid
-    metadata: "s3://nextstrain-ncov-private/metadata.tsv.gz"
-    aligned: "s3://nextstrain-ncov-private/sequences.fasta.xz"
+    metadata: "s3://nextstrain-ncov-private/metadata.tsv.zst"
+    aligned: "s3://nextstrain-ncov-private/sequences.fasta.zst"
     skip_sanitize_metadata: true
 
 builds:
30 changes: 30 additions & 0 deletions nextstrain_profiles/100k/config-open.yaml
@@ -0,0 +1,30 @@
+# This file is largely duplicated from `config-gisaid.yaml` - please
+# see that file for comments
+S3_DST_BUCKET: "nextstrain-data/files/ncov/open/100k" # TODO XXX
+S3_DST_ORIGINS: [needed-for-workflow-but-unused]
+deploy_url: needed_for_workflow_but_unused
+custom_rules:
+  - workflow/snakemake_rules/export_for_nextstrain.smk
+inputs:
+  - name: open
+    metadata: "s3://nextstrain-data/files/ncov/open/metadata.tsv.zst"
+    aligned: "s3://nextstrain-data/files/ncov/open/sequences.fasta.zst"
+    skip_sanitize_metadata: true
+builds:
+  100k:
+    subsampling_scheme: 100k_scheme
+upload:
+  metadata.tsv.xz: results/100k/100k_subsampled_metadata.tsv.xz
+  sequences.fasta.xz: results/100k/100k_subsampled_sequences.fasta.xz
+filter:
+  exclude_where: "division='USA'"
+subsampling:
+  100k_scheme:
+    50k_early:
+      group_by: "year month country"
+      max_sequences: 50000
+      max_date: "--max-date 1Y"
+    50k_late:
+      group_by: "year month country"
+      max_sequences: 50000
+      min_date: "--min-date 1Y"
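
For intuition, the `100k_scheme` above corresponds roughly to two `augur filter` calls like the following (a sketch, not the workflow's literal generated rules; file names are illustrative). The relative dates `--max-date 1Y` and `--min-date 1Y` split the data at twelve months ago, matching the "50,000 from the previous 12 months and 50,000 prior to that" description in the docs change above:

```
# ~50,000 sequences spread evenly (by year/month/country) over data older than 12 months
augur filter \
  --metadata metadata.tsv --sequences sequences.fasta \
  --group-by year month country \
  --subsample-max-sequences 50000 \
  --max-date 1Y \
  --output-metadata 50k_early_metadata.tsv \
  --output-sequences 50k_early_sequences.fasta

# ~50,000 sequences spread evenly over the most recent 12 months
augur filter \
  --metadata metadata.tsv --sequences sequences.fasta \
  --group-by year month country \
  --subsample-max-sequences 50000 \
  --min-date 1Y \
  --output-metadata 50k_late_metadata.tsv \
  --output-sequences 50k_late_sequences.fasta
```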
