Dev #7

Merged 4 commits on Oct 25, 2023
43 changes: 43 additions & 0 deletions .github/workflows/stub.yml
@@ -0,0 +1,43 @@
name: nf-core CI
# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors
on:
  push:
    branches:
      - dev
  pull_request:
  release:
    types: [published]

env:
  NXF_ANSI_LOG: false

concurrency:
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
  cancel-in-progress: true

jobs:
  test:
    name: Run pipeline with test data
    # Only run on push if this is the nf-core dev branch (merged PRs)
    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'mk-kondo/mikrokondo') }}"
    runs-on: ubuntu-latest
    strategy:
      matrix:
        NXF_VER:
          - "22.10.1"
          - "latest-everything"
    steps:
      - name: Check out pipeline code
        uses: actions/checkout@v3

      - name: Install Nextflow
        uses: nf-core/setup-nextflow@v1
        with:
          version: "${{ matrix.NXF_VER }}"

      - name: Run pipeline with test data
        # TODO nf-core: You can customise CI pipeline run tests as required
        # For example: adding multiple test runs with different parameters
        # Remember that you can parallelise this by using strategy.matrix
        run: |
          nextflow run ${GITHUB_WORKSPACE} -profile test_stub,docker -stub-run --outdir ./results
7 changes: 7 additions & 0 deletions .prettierignore
@@ -10,3 +10,10 @@ testing/
testing*
*.pyc
bin/
docs/*
assets*
mkdocs.yml
nextflow_schema.json
README.md
fontlist-v330.json
CITATIONS.md
1 change: 1 addition & 0 deletions .prettierrc.yml
@@ -0,0 +1 @@
printWidth: 120
43 changes: 20 additions & 23 deletions bin/GTDB_GCF_match.py
@@ -14,11 +14,13 @@ class PasteyMcPasteFace:
Returns:
_type_: _description_
"""

...


class AssemblyPaths:
"""Take in a file of assemblies and parse out the assembly prefix
"""
"""Take in a file of assemblies and parse out the assembly prefix"""

_rs_gb_delimiter = "_"
_gcf_gca_delimiter = "_"
_search_prefix = "GCF"
@@ -37,28 +39,25 @@ def __init__(self, fp_assembly_path, fp_taxa_info):
self.filtered_output = self.remove_missing_taxa_info()
self.save_outputs()


def save_outputs(self):
"""write multiple output files for mash sketch
Mash seems to need labels passed to the CLI for each sequence to sketch so now the samples are being used to create cli commands
"""
id_data = open("mash_cli.txt", "w", encoding='utf8')
#path_data = open("mash_paths.txt", "w", encoding='utf8')
#taxa_data = open("mash_taxa.txt", "w", encoding='utf8')
id_data = open("mash_cli.txt", "w", encoding="utf8")
# path_data = open("mash_paths.txt", "w", encoding='utf8')
# taxa_data = open("mash_taxa.txt", "w", encoding='utf8')
for k, v in self.filtered_output.items():
id_data.write(f"\"{v[0]}\" -I '{k}' -C '{v[1]}'\n")
#path_data.write(f"{v[0]}\n")
#taxa_data.write(f"{v[1]}\n")
# path_data.write(f"{v[0]}\n")
# taxa_data.write(f"{v[1]}\n")

id_data.close()
#path_data.close()
#taxa_data.close()

# path_data.close()
# taxa_data.close()

def remove_missing_taxa_info(self):
"""remove dictionary entries with missing taxa info
"""
"""remove dictionary entries with missing taxa info"""
keys_rm = set()
for k, v in self.file_names.items():
if len(v) == 1:
@@ -69,15 +68,14 @@ def remove_missing_taxa_info(self):
new_dict = {k: v for k, v in self.file_names.items() if k not in keys_rm}
return new_dict


def merge_taxa_info(self):
"""merge the taxa data and the assembly paths
Returns:
_type_: _description_
"""
for i in self.taxa:
if self._search_prefix != i[0][0:self._search_pre_len]:
if self._search_prefix != i[0][0 : self._search_pre_len]:
continue
if self.file_names.get(i[0]):
self.file_names[i[0]].append(i[1])
@@ -94,14 +92,13 @@ def file_exist(file_in):
sys.exit(-1)

def taxon_info(self):
"""Parse assembly name and taxa info
"""
"""Parse assembly name and taxa info"""
out_list = []
with open(self.taxa_info, 'r', encoding='utf8') as taxa:
with open(self.taxa_info, "r", encoding="utf8") as taxa:
for i in taxa.readlines():
split_line = i.strip().split("\t")
# split at first underscore as gtdb appends whether source is refseq or genbank
name = split_line[0][split_line[0].index(self._rs_gb_delimiter)+1:]
name = split_line[0][split_line[0].index(self._rs_gb_delimiter) + 1 :]
out_list.append((name, split_line[1]))

return out_list
@@ -110,11 +107,11 @@ def parse_assembly_name(self):
"""
read and parse all entries
"""
with open(self.fp, 'r', encoding='utf8') as file_in:
with open(self.fp, "r", encoding="utf8") as file_in:
# all file names are delimited from tail at second underscore
#lines = map(lambda x: (os.path.basename(x[:x.rindex("_")]), x.strip()), file_in.readlines())
#lines = [(os.path.basename(x[:x.rindex(self._gcf_gca_delimiter)]), x.strip()) for x in file_in.readlines()]
lines = {os.path.basename(x[:x.rindex(self._gcf_gca_delimiter)]): [x.strip()] for x in file_in.readlines()}
# lines = map(lambda x: (os.path.basename(x[:x.rindex("_")]), x.strip()), file_in.readlines())
# lines = [(os.path.basename(x[:x.rindex(self._gcf_gca_delimiter)]), x.strip()) for x in file_in.readlines()]
lines = {os.path.basename(x[: x.rindex(self._gcf_gca_delimiter)]): [x.strip()] for x in file_in.readlines()}
return lines


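For illustration only (not part of this PR's diff): a minimal sketch of the matching logic in GTDB_GCF_match.py above, using hypothetical assembly paths and GTDB rows. It keys each assembly by the basename text before its last underscore (as in parse_assembly_name), strips the RS_/GB_ source prefix from each accession (taxon_info), attaches taxonomy to matching GCF keys (merge_taxa_info), and drops entries that never received taxonomy (remove_missing_taxa_info).

import os

# Hypothetical inputs, for illustration only.
assembly_paths = ["/data/GCF_000001.1_genomic.fna.gz", "/data/GCF_000002.1_genomic.fna.gz"]
gtdb_rows = ["RS_GCF_000001.1\td__Bacteria;p__Pseudomonadota", "GB_GCA_000003.1\td__Archaea"]

# Key each path by the basename text before the last underscore.
assemblies = {
    os.path.basename(p)[: os.path.basename(p).rindex("_")]: [p] for p in assembly_paths
}

# Strip the RS_/GB_ prefix and attach taxonomy to matching GCF accessions.
for row in gtdb_rows:
    accession, taxonomy = row.split("\t")
    name = accession[accession.index("_") + 1 :]
    if name.startswith("GCF") and name in assemblies:
        assemblies[name].append(taxonomy)

# Drop assemblies that never received taxonomy (entries still of length 1).
matched = {k: v for k, v in assemblies.items() if len(v) > 1}
print(matched)  # {'GCF_000001.1': ['/data/GCF_000001.1_genomic.fna.gz', 'd__Bacteria;p__Pseudomonadota']}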
2 changes: 1 addition & 1 deletion bin/check_samplesheet.py
@@ -160,7 +160,7 @@ def sniff_format(handle):
sniffer = csv.Sniffer()
if not sniffer.has_header(peek):
logger.critical("The given sample sheet does not appear to contain a header.")
#sys.exit(1)
# sys.exit(1)
dialect = sniffer.sniff(peek)
return dialect

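For illustration only (not part of this PR's diff): the change above makes sniff_format warn rather than exit when csv.Sniffer finds no header. A minimal standalone sketch of that behaviour, using an invented sample-sheet snippet:

import csv

# Toy sample-sheet text, invented for the example.
peek = "sample,fastq_1,fastq_2\nS1,S1_R1.fastq.gz,S1_R2.fastq.gz\n"

sniffer = csv.Sniffer()
if not sniffer.has_header(peek):
    # Log and continue instead of calling sys.exit(1), as in the edited sniff_format().
    print("The given sample sheet does not appear to contain a header.")
dialect = sniffer.sniff(peek)
print(dialect.delimiter)  # ','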
38 changes: 17 additions & 21 deletions bin/create_summary_csv.py
@@ -13,9 +13,10 @@
import re
import sys


class JsonImport:
"""Intake json report to convert to CSV
"""
"""Intake json report to convert to CSV"""

__depth_limit = 10
__keep_keys = set(["meta", "QualityAnalysis", "QCSummary", "QCStatus"])
__delimiter = "\t"
@@ -29,10 +30,9 @@ def __init__(self, report_fp, output_name):
self.formatted_data = self.format_for_csv(self.normalized, self.rows)
self.to_file()


def to_file(self):
with open(self.output_name, "w") as out_file:
out_file.write(self.__delimiter) # first column is index
out_file.write(self.__delimiter) # first column is index
for i in self.formatted_data:
out_file.write(f"{i[0]}{self.__delimiter}")
out_file.write("\n")
@@ -44,7 +44,7 @@ def to_file(self):
out_file.write(f'"{val_write}"')
else:
out_file.write(val_write)
#out_file.write(str(ii[1][i]).replace('\n', ' \\'))
# out_file.write(str(ii[1][i]).replace('\n', ' \\'))
out_file.write(self.__delimiter)
out_file.write("\n")

@@ -53,7 +53,7 @@ def format_for_csv(self, results, rows):
data = []
for k, v in results.items():
n_row = copy.deepcopy(row)
sample_data = [key for key in v.keys() if key != "summary" ]
sample_data = [key for key in v.keys() if key != "summary"]
for item in v["summary"]:
n_row[item[0]] = item[1]
n_row_samp = copy.deepcopy(n_row)
@@ -119,15 +119,15 @@ def flatten_groups(self, data):
qc_status_rows.extend(list(qc_analysis_rows))
qc_status_rows.append("QCSummary")
qc_status_rows.extend(list(meta_data_rows))
#meta_data_rows = list(meta_data_rows)
#meta_data_rows.extend(list(qc_analysis_rows))
#meta_data_rows.append("QCSummary")
# meta_data_rows = list(meta_data_rows)
# meta_data_rows.extend(list(qc_analysis_rows))
# meta_data_rows.append("QCSummary")
rows = list(rows)
rows.sort(reverse=True)
qc_status_rows.extend(rows)
#meta_data_rows.extend(rows)
# meta_data_rows.extend(rows)

#return (sample_data_overview, meta_data_rows)
# return (sample_data_overview, meta_data_rows)
return (sample_data_overview, qc_status_rows)

def get_quality_analysis_fields(self, qc_fields):
@@ -139,23 +139,22 @@ def get_quality_analysis_fields(self, qc_fields):
fields.append((v["field"], v["message"]))
return fields


def recurse_json(self, dict_, prev_key, results):
if isinstance(dict_, dict):
for key in dict_:
if isinstance(dict_[key], dict):
self.recurse_json(dict_[key], prev_key=prev_key+"."+key, results=results)
self.recurse_json(dict_[key], prev_key=prev_key + "." + key, results=results)
elif isinstance(dict_[key], list):
for val in dict_[key]:
if isinstance(val, dict):
self.recurse_json(val, prev_key=prev_key+"."+key+"."+val, results=results)
self.recurse_json(val, prev_key=prev_key + "." + key + "." + val, results=results)
else:
results.append((prev_key + "." + key, dict_[key]))
else:
results.append((prev_key + "." + key, dict_[key]))
else:
if isinstance(dict_, str):
results.append((prev_key, dict_.replace("\"", "")))
results.append((prev_key, dict_.replace('"', "")))
elif isinstance(dict_, float):
results.append((prev_key, dict_))
elif isinstance(dict_, list):
Expand All @@ -169,7 +168,6 @@ def recurse_json(self, dict_, prev_key, results):
sys.stderr.write(f"Having issues with report JSON value {prev_key}. Data value {dict_}\n")
results.append((prev_key, dict_))


def regroup_data(self, paths):
"""Re-group data into a form that is easier to put into CSV format
@@ -189,8 +187,7 @@ def regroup_data(self, paths):
return sample_specific

def subset_paths(self):
"""Could be made faster by using the dictionary as input
"""
"""Could be made faster by using the dictionary as input"""
paths = []
for k in self.qc_paths:
sample = k[0][0]
Expand All @@ -200,7 +197,6 @@ def subset_paths(self):
paths.append(k)
return paths


def get_samples(self, dict_obj):
keys = dict_obj.keys()
return keys
@@ -243,7 +239,6 @@ def get_all_fields(self, dict_obj, samples):
path = [i]
self.recurse_dict(dict_obj[i], 0, info, path)


def normalize_overlapping_fields(self, normalized_dict):
"""Normalize overlapping json fields, e.g. if a sample is metagenomic and has species data
copied, add in the fastp data to each part
Expand All @@ -268,7 +263,7 @@ def normalize_overlapping_fields(self, normalized_dict):

def ingest_report(self, report_fp):
data = None
with open(report_fp, 'r', encoding='utf8') as report:
with open(report_fp, "r", encoding="utf8") as report:
data = json.load(report)
return data

@@ -284,6 +279,7 @@ def main_(args_in):
sys.stderr.write(f"{args.file_in} does not exist.\n")
sys.exit(-1)


if __name__ == "__main__":
# pass json file to program to parse it
main_(sys.argv[1:])
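For illustration only (not part of this PR's diff): a rough sketch of the dotted-key flattening that recurse_json in create_summary_csv.py performs on the report JSON, run over an invented two-field report fragment:

import json

def flatten(node, prev_key, results):
    """Collapse nested dicts/lists into (dotted.path, value) pairs, loosely mirroring recurse_json."""
    if isinstance(node, dict):
        for key, value in node.items():
            flatten(value, f"{prev_key}.{key}" if prev_key else key, results)
    elif isinstance(node, list):
        for value in node:
            flatten(value, prev_key, results)
    else:
        results.append((prev_key, node))

# Invented report fragment, for illustration only.
report = json.loads('{"sample1": {"meta": {"metagenomic": false}, "QCStatus": "PASSED"}}')
rows = []
flatten(report, "", rows)
print(rows)  # [('sample1.meta.metagenomic', False), ('sample1.QCStatus', 'PASSED')]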