Dev #7

Merged 4 commits on Oct 25, 2023
43 changes: 43 additions & 0 deletions .github/workflows/stub.yml
@@ -0,0 +1,43 @@
name: nf-core CI
# This workflow runs the pipeline with the minimal test dataset to check that it completes without any syntax errors
on:
  push:
    branches:
      - dev
  pull_request:
  release:
    types: [published]

env:
  NXF_ANSI_LOG: false

concurrency:
  group: "${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}"
  cancel-in-progress: true

jobs:
  test:
    name: Run pipeline with test data
    # Only run on push if this is the nf-core dev branch (merged PRs)
    if: "${{ github.event_name != 'push' || (github.event_name == 'push' && github.repository == 'mk-kondo/mikrokondo') }}"
    runs-on: ubuntu-latest
    strategy:
      matrix:
        NXF_VER:
          - "22.10.1"
          - "latest-everything"
    steps:
      - name: Check out pipeline code
        uses: actions/checkout@v3

      - name: Install Nextflow
        uses: nf-core/setup-nextflow@v1
        with:
          version: "${{ matrix.NXF_VER }}"

      - name: Run pipeline with test data
        # TODO nf-core: You can customise CI pipeline run tests as required
        # For example: adding multiple test runs with different parameters
        # Remember that you can parallelise this by using strategy.matrix
        run: |
          nextflow run ${GITHUB_WORKSPACE} -profile test_stub,docker -stub-run --outdir ./results
7 changes: 7 additions & 0 deletions .prettierignore
@@ -10,3 +10,10 @@ testing/
testing*
*.pyc
bin/
docs/*
assets*
mkdocs.yml
nextflow_schema.json
README.md
fontlist-v330.json
CITATIONS.md
1 change: 1 addition & 0 deletions .prettierrc.yml
@@ -0,0 +1 @@
printWidth: 120
43 changes: 20 additions & 23 deletions bin/GTDB_GCF_match.py
@@ -14,11 +14,13 @@ class PasteyMcPasteFace:
Returns:
_type_: _description_
"""

...


class AssemblyPaths:
"""Take in a file of assemblies and parse out the assembly prefix
"""
"""Take in a file of assemblies and parse out the assembly prefix"""

_rs_gb_delimiter = "_"
_gcf_gca_delimiter = "_"
_search_prefix = "GCF"
@@ -37,28 +39,25 @@ def __init__(self, fp_assembly_path, fp_taxa_info):
self.filtered_output = self.remove_missing_taxa_info()
self.save_outputs()


def save_outputs(self):
"""write multiple output files for mash sketch
Mash seems to need labels passed to the CLI for each sequence to sketch so now the samples are being used to create cli commands
"""
id_data = open("mash_cli.txt", "w", encoding='utf8')
#path_data = open("mash_paths.txt", "w", encoding='utf8')
#taxa_data = open("mash_taxa.txt", "w", encoding='utf8')
id_data = open("mash_cli.txt", "w", encoding="utf8")
# path_data = open("mash_paths.txt", "w", encoding='utf8')
# taxa_data = open("mash_taxa.txt", "w", encoding='utf8')
for k, v in self.filtered_output.items():
id_data.write(f"\"{v[0]}\" -I '{k}' -C '{v[1]}'\n")
#path_data.write(f"{v[0]}\n")
#taxa_data.write(f"{v[1]}\n")
# path_data.write(f"{v[0]}\n")
# taxa_data.write(f"{v[1]}\n")

id_data.close()
#path_data.close()
#taxa_data.close()

# path_data.close()
# taxa_data.close()

def remove_missing_taxa_info(self):
"""remove dictionary entries with missing taxa info
"""
"""remove dictionary entries with missing taxa info"""
keys_rm = set()
for k, v in self.file_names.items():
if len(v) == 1:
@@ -69,15 +68,14 @@ def remove_missing_taxa_info(self):
new_dict = {k: v for k, v in self.file_names.items() if k not in keys_rm}
return new_dict


def merge_taxa_info(self):
"""merge the taxa data and the assembly paths
Returns:
_type_: _description_
"""
for i in self.taxa:
if self._search_prefix != i[0][0:self._search_pre_len]:
if self._search_prefix != i[0][0 : self._search_pre_len]:
continue
if self.file_names.get(i[0]):
self.file_names[i[0]].append(i[1])
@@ -94,14 +92,13 @@ def file_exist(file_in):
sys.exit(-1)

def taxon_info(self):
"""Parse assembly name and taxa info
"""
"""Parse assembly name and taxa info"""
out_list = []
with open(self.taxa_info, 'r', encoding='utf8') as taxa:
with open(self.taxa_info, "r", encoding="utf8") as taxa:
for i in taxa.readlines():
split_line = i.strip().split("\t")
# split at first underscore as gtdb appends whether source is refseq or genbank
name = split_line[0][split_line[0].index(self._rs_gb_delimiter)+1:]
name = split_line[0][split_line[0].index(self._rs_gb_delimiter) + 1 :]
out_list.append((name, split_line[1]))

return out_list
@@ -110,11 +107,11 @@ def parse_assembly_name(self):
"""
read and parse all entries
"""
with open(self.fp, 'r', encoding='utf8') as file_in:
with open(self.fp, "r", encoding="utf8") as file_in:
# all file names are delimited from tail at second underscore
#lines = map(lambda x: (os.path.basename(x[:x.rindex("_")]), x.strip()), file_in.readlines())
#lines = [(os.path.basename(x[:x.rindex(self._gcf_gca_delimiter)]), x.strip()) for x in file_in.readlines()]
lines = {os.path.basename(x[:x.rindex(self._gcf_gca_delimiter)]): [x.strip()] for x in file_in.readlines()}
# lines = map(lambda x: (os.path.basename(x[:x.rindex("_")]), x.strip()), file_in.readlines())
# lines = [(os.path.basename(x[:x.rindex(self._gcf_gca_delimiter)]), x.strip()) for x in file_in.readlines()]
lines = {os.path.basename(x[: x.rindex(self._gcf_gca_delimiter)]): [x.strip()] for x in file_in.readlines()}
return lines


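For illustration only (not part of this PR's diff): a minimal sketch of the matching logic in GTDB_GCF_match.py above, using hypothetical assembly paths and GTDB rows. It keys each assembly by the basename text before its last underscore (as in parse_assembly_name), strips the RS_/GB_ source prefix from each accession (taxon_info), attaches taxonomy to matching GCF keys (merge_taxa_info), and drops entries that never received taxonomy (remove_missing_taxa_info).

import os

# Hypothetical inputs, for illustration only.
assembly_paths = ["/data/GCF_000001.1_genomic.fna.gz", "/data/GCF_000002.1_genomic.fna.gz"]
gtdb_rows = ["RS_GCF_000001.1\td__Bacteria;p__Pseudomonadota", "GB_GCA_000003.1\td__Archaea"]

# Key each path by the basename text before the last underscore.
assemblies = {
    os.path.basename(p)[: os.path.basename(p).rindex("_")]: [p] for p in assembly_paths
}

# Strip the RS_/GB_ prefix and attach taxonomy to matching GCF accessions.
for row in gtdb_rows:
    accession, taxonomy = row.split("\t")
    name = accession[accession.index("_") + 1 :]
    if name.startswith("GCF") and name in assemblies:
        assemblies[name].append(taxonomy)

# Drop assemblies that never received taxonomy (entries still of length 1).
matched = {k: v for k, v in assemblies.items() if len(v) > 1}
print(matched)  # {'GCF_000001.1': ['/data/GCF_000001.1_genomic.fna.gz', 'd__Bacteria;p__Pseudomonadota']}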
2 changes: 1 addition & 1 deletion bin/check_samplesheet.py
@@ -160,7 +160,7 @@ def sniff_format(handle):
sniffer = csv.Sniffer()
if not sniffer.has_header(peek):
logger.critical("The given sample sheet does not appear to contain a header.")
#sys.exit(1)
# sys.exit(1)
dialect = sniffer.sniff(peek)
return dialect

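For illustration only (not part of this PR's diff): the change above makes sniff_format warn rather than exit when csv.Sniffer finds no header. A minimal standalone sketch of that behaviour, using an invented sample-sheet snippet:

import csv

# Toy sample-sheet text, invented for the example.
peek = "sample,fastq_1,fastq_2\nS1,S1_R1.fastq.gz,S1_R2.fastq.gz\n"

sniffer = csv.Sniffer()
if not sniffer.has_header(peek):
    # Log and continue instead of calling sys.exit(1), as in the edited sniff_format().
    print("The given sample sheet does not appear to contain a header.")
dialect = sniffer.sniff(peek)
print(dialect.delimiter)  # ','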
38 changes: 17 additions & 21 deletions bin/create_summary_csv.py
@@ -13,9 +13,10 @@
import re
import sys


class JsonImport:
"""Intake json report to convert to CSV
"""
"""Intake json report to convert to CSV"""

__depth_limit = 10
__keep_keys = set(["meta", "QualityAnalysis", "QCSummary", "QCStatus"])
__delimiter = "\t"
@@ -29,10 +30,9 @@ def __init__(self, report_fp, output_name):
self.formatted_data = self.format_for_csv(self.normalized, self.rows)
self.to_file()


def to_file(self):
with open(self.output_name, "w") as out_file:
out_file.write(self.__delimiter) # first column is index
out_file.write(self.__delimiter) # first column is index
for i in self.formatted_data:
out_file.write(f"{i[0]}{self.__delimiter}")
out_file.write("\n")
@@ -44,7 +44,7 @@ def to_file(self):
out_file.write(f'"{val_write}"')
else:
out_file.write(val_write)
#out_file.write(str(ii[1][i]).replace('\n', ' \\'))
# out_file.write(str(ii[1][i]).replace('\n', ' \\'))
out_file.write(self.__delimiter)
out_file.write("\n")

@@ -53,7 +53,7 @@ def format_for_csv(self, results, rows):
data = []
for k, v in results.items():
n_row = copy.deepcopy(row)
sample_data = [key for key in v.keys() if key != "summary" ]
sample_data = [key for key in v.keys() if key != "summary"]
for item in v["summary"]:
n_row[item[0]] = item[1]
n_row_samp = copy.deepcopy(n_row)
@@ -119,15 +119,15 @@ def flatten_groups(self, data):
qc_status_rows.extend(list(qc_analysis_rows))
qc_status_rows.append("QCSummary")
qc_status_rows.extend(list(meta_data_rows))
#meta_data_rows = list(meta_data_rows)
#meta_data_rows.extend(list(qc_analysis_rows))
#meta_data_rows.append("QCSummary")
# meta_data_rows = list(meta_data_rows)
# meta_data_rows.extend(list(qc_analysis_rows))
# meta_data_rows.append("QCSummary")
rows = list(rows)
rows.sort(reverse=True)
qc_status_rows.extend(rows)
#meta_data_rows.extend(rows)
# meta_data_rows.extend(rows)

#return (sample_data_overview, meta_data_rows)
# return (sample_data_overview, meta_data_rows)
return (sample_data_overview, qc_status_rows)

def get_quality_analysis_fields(self, qc_fields):
@@ -139,23 +139,22 @@ def get_quality_analysis_fields(self, qc_fields):
fields.append((v["field"], v["message"]))
return fields


def recurse_json(self, dict_, prev_key, results):
if isinstance(dict_, dict):
for key in dict_:
if isinstance(dict_[key], dict):
self.recurse_json(dict_[key], prev_key=prev_key+"."+key, results=results)
self.recurse_json(dict_[key], prev_key=prev_key + "." + key, results=results)
elif isinstance(dict_[key], list):
for val in dict_[key]:
if isinstance(val, dict):
self.recurse_json(val, prev_key=prev_key+"."+key+"."+val, results=results)
self.recurse_json(val, prev_key=prev_key + "." + key + "." + val, results=results)
else:
results.append((prev_key + "." + key, dict_[key]))
else:
results.append((prev_key + "." + key, dict_[key]))
else:
if isinstance(dict_, str):
results.append((prev_key, dict_.replace("\"", "")))
results.append((prev_key, dict_.replace('"', "")))
elif isinstance(dict_, float):
results.append((prev_key, dict_))
elif isinstance(dict_, list):
Expand All @@ -169,7 +168,6 @@ def recurse_json(self, dict_, prev_key, results):
sys.stderr.write(f"Having issues with report JSON value {prev_key}. Data value {dict_}\n")
results.append((prev_key, dict_))


def regroup_data(self, paths):
"""Re-group data into a form that is easier to put into CSV format
@@ -189,8 +187,7 @@ def regroup_data(self, paths):
return sample_specific

def subset_paths(self):
"""Could be made faster by using the dictionary as input
"""
"""Could be made faster by using the dictionary as input"""
paths = []
for k in self.qc_paths:
sample = k[0][0]
Expand All @@ -200,7 +197,6 @@ def subset_paths(self):
paths.append(k)
return paths


def get_samples(self, dict_obj):
keys = dict_obj.keys()
return keys
@@ -243,7 +239,6 @@ def get_all_fields(self, dict_obj, samples):
path = [i]
self.recurse_dict(dict_obj[i], 0, info, path)


def normalize_overlapping_fields(self, normalized_dict):
"""Normalize overlapping json fields, e.g. if a sample is metagenomic and has species data
copied, add in the fastp data to each part
Expand All @@ -268,7 +263,7 @@ def normalize_overlapping_fields(self, normalized_dict):

def ingest_report(self, report_fp):
data = None
with open(report_fp, 'r', encoding='utf8') as report:
with open(report_fp, "r", encoding="utf8") as report:
data = json.load(report)
return data

@@ -284,6 +279,7 @@ def main_(args_in):
sys.stderr.write(f"{args.file_in} does not exist.\n")
sys.exit(-1)


if __name__ == "__main__":
# pass json file to program to parse it
main_(sys.argv[1:])
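For illustration only (not part of this PR's diff): a rough sketch of the dotted-key flattening that recurse_json in create_summary_csv.py performs on the report JSON, run over an invented two-field report fragment:

import json

def flatten(node, prev_key, results):
    """Collapse nested dicts/lists into (dotted.path, value) pairs, loosely mirroring recurse_json."""
    if isinstance(node, dict):
        for key, value in node.items():
            flatten(value, f"{prev_key}.{key}" if prev_key else key, results)
    elif isinstance(node, list):
        for value in node:
            flatten(value, prev_key, results)
    else:
        results.append((prev_key, node))

# Invented report fragment, for illustration only.
report = json.loads('{"sample1": {"meta": {"metagenomic": false}, "QCStatus": "PASSED"}}')
rows = []
flatten(report, "", rows)
print(rows)  # [('sample1.meta.metagenomic', False), ('sample1.QCStatus', 'PASSED')]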