From 652b7c8e8529065b9982bd27648eec282b5920b0 Mon Sep 17 00:00:00 2001 From: eriny Date: Tue, 9 Jul 2024 12:39:53 -0600 Subject: [PATCH 1/9] allows downloading of fastq files by analysis --- download_by_analysis.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100755 download_by_analysis.py diff --git a/download_by_analysis.py b/download_by_analysis.py new file mode 100755 index 0000000..e69de29 From 7d857d441ccd8b58253b4f49e2e0d90d30bea68f Mon Sep 17 00:00:00 2001 From: eriny Date: Tue, 9 Jul 2024 12:40:04 -0600 Subject: [PATCH 2/9] allows downloading of fastq files by run --- download_by_run.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) create mode 100755 download_by_run.py diff --git a/download_by_run.py b/download_by_run.py new file mode 100755 index 0000000..e69de29 From 6b474e472ea2b6252b4165f39bbde28c8d4054af Mon Sep 17 00:00:00 2001 From: eriny Date: Tue, 9 Jul 2024 12:40:26 -0600 Subject: [PATCH 3/9] grandeur results to google sheet formatting --- grandeur_aws_to_sheets.py | 372 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100755 grandeur_aws_to_sheets.py diff --git a/grandeur_aws_to_sheets.py b/grandeur_aws_to_sheets.py new file mode 100755 index 0000000..ef2c43a --- /dev/null +++ b/grandeur_aws_to_sheets.py @@ -0,0 +1,372 @@ +#!/usr/bin/env python3 + +''' +Author: Erin Young + +Description: + +This script will collect needed files to get results from Grandeur for +1. The 'Finished' tab (will be discontinued in the future) +2. As a lookup table for the 'ARLN' tab + +EXAMPLE: +python3 grandeur_aws_to_sheets.py -g -s +''' + +# trying to keep dependencies really low +import argparse +import pandas as pd +import os +import logging +import glob + +# local files +from read_miseq_sample_sheet import read_miseq_sample_sheet + +logging.basicConfig(format='%(asctime)s - %(message)s', datefmt = '%y-%b-%d %H:%M:%S') + +def sample_sheet_to_df(samplesheet): + logging.info(f"Getting samples from {samplesheet}") + + if os.path.exists(samplesheet): + # get all the samples from the sample sheet into a pandas dataframe + df = read_miseq_sample_sheet(samplesheet, "sample_id") + df['lims_id'] = df['Sample_ID'].str.replace( '-UT.*','', regex=True) + df['wgs_id'] = df['Sample_Name'].str.replace('-UT.*','', regex=True) + + return df + + else: + logging.fatal('Sample sheet could not be located! (Specify with -s)') + exit(1) + +def amrfinder_results(df, args): + amrfinder_dir = args.grandeur + '/ncbi-AMRFinderplus/' + + if not os.path.isdir(amrfinder_dir): + return df + + logging.info('Getting virulence and AMR genes from ' + amrfinder_dir) + dfs = [] + for filename in os.listdir(amrfinder_dir): + if filename.endswith("_amrfinder_plus.txt"): + filepath = os.path.join(amrfinder_dir, filename) + ind_df = pd.read_csv(filepath) + if not ind_df.empty: + dfs.append(ind_df) + + if dfs: + amrfinder_df = pd.concat(dfs, ignore_index=True) + else: + return df + + if not amrfinder_df.empty: + amrfinder_df = amrfinder_df.sort_values('Gene symbol') + + amr_df = amrfinder_df[amrfinder_df['Element type'] == 'AMR'].copy() + amr_df = amr_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: list(x)}) + amr_df['amr genes'] = amr_df['Gene symbol'] + df = pd.merge(df,amr_df[['Name','amr genes']],left_on='Sample_Name', right_on='Name', how='left') + df = df.drop('Name', axis=1) + + vir_df = amrfinder_df[amrfinder_df['Element type'] == 'VIRULENCE'].copy() + vir_df = vir_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: list(x)}) + vir_df['virulence genes'] = vir_df['Gene symbol'] + df = pd.merge(df,vir_df[['Name','virulence genes']],left_on='Sample_Name', right_on='Name', how='left') + df = df.drop('Name', axis=1) + + return df + +def create_files(df): + # columns for final files + # For the results tab : top organism, serotypefinder/shigellatyper, seqsero2, coverage, warnings, blobtools, and kraken2 + finished_cols = ['wgs_id', 'Description', 'organism', 'SerotypeFinder (E. coli)', 'SeqSero Organism (Salmonella)', 'Sample_Project', 'coverage', 'Pass', 'warnings', 'blobtools_organism_(per_mapped_reads)'] + # ARLN needs 'WGS MLST', 'AMR genes', 'Virulence genes' + arln_cols = ['wgs_id', 'Description', 'organism', 'coverage', 'mlst', 'emm type', 'Sample_Project', 'amr genes', 'virulence genes', 'kraken2_organism_(per_fragment)', 'blobtools_organism_(per_mapped_reads)'] + + for col in finished_cols + arln_cols: + if col not in df.columns: + df[col] = None + + + df = df.fillna('') + df = df.sort_values('wgs_id') + + logging.info('Writing file for Finished tab') + df.to_csv('finished_tab.tsv', columns = finished_cols, index=False, sep = '\t' ) + df.to_csv('finished_tab.txt', columns = finished_cols, index=False, sep = ';' ) + logging.info('Created finished_tab.{txt,tsv}') + + logging.info('Writing file for ARLN tab') + df.to_csv('arln_tab.tsv', columns = arln_cols, index=False, sep = '\t' ) + df.to_csv('arln_tab.txt', columns = arln_cols, index=False, sep = ';' ) + logging.info('Created arln_tab.{txt,tsv}') + + +def emmtyper_results(df, args): + emmtyper_dir = args.grandeur + '/emmtyper/' + + if not os.path.isdir(emmtyper_dir): + return df + + logging.info('Getting emmtyper information from ' + emmtyper_dir) + + dfs = [] + + for filename in os.listdir(emmtyper_dir): + if filename.endswith("mlst.tsv"): + filepath = os.path.join(emmtyper_dir, filename) + ind_df = pd.read_table(filepath, sep="\t") + if not ind_df.empty: + dfs.append(ind_df) + + if dfs: + emmtyper_df = pd.concat(dfs, ignore_index=True) + else: + return df + + if not emmtyper_df.empty: + emmtyper_df['mlst'] = mlst_df['matching PubMLST scheme'] + ':' + mlst_df['ST'].astype('str') + df = pd.merge(df,emmtyper_df[['sample','mlst']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + + +def fastani_results(df, args): + fastani_dir = args.grandeur + '/fastani/' + + if not os.path.isdir(fastani_dir): + return df + + logging.info('Getting the top organism from fastani at ' + fastani_dir) + dfs = [] + for filename in os.listdir(fastani_dir): + if filename.endswith("_fastani.csv"): + filepath = os.path.join(fastani_dir, filename) + ind_df = pd.read_csv(filepath) + if not ind_df.empty: + dfs.append(ind_df) + + if dfs: + fastani_df = pd.concat(dfs, ignore_index=True) + else: + return df + + # Getting the WGS organism from fastani or mash + if not fastani_df.empty: + fastani_df = fastani_df[fastani_df['ANI estimate'] >= 0.9] + fastani_df = fastani_df.sort_values(by=['ANI estimate'], ascending = False) + fastani_df = fastani_df.drop_duplicates(subset=['sample'], keep = 'first') + fastani_df['organism'] = fastani_df['reference'].str.replace('_GC.*', '', regex = True) + + df = pd.merge(df,fastani_df[['sample','organism']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + + +def grandeur_summary(df, args): + summary = args.grandeur + '/grandeur_summary.tsv' + logging.info('Extracting information from ' + summary) + + # using results in summary file instead of the "original" + # blobtools = args.grandeur + '/blobtools/blobtools_summary.txt' + # kraken2 = args.grandeur + '/kraken2/kraken2_summary.csv' + # serotypefinder = args.grandeur + '/serotypefinder/serotypefinder_results.txt' + # shigatyper = args.grandeur + '/shigatyper/shigatyper_results.txt' + + # getting coverage + summary_df = pd.read_table(summary) + df = pd.merge(df,summary_df[['sample','coverage','warnings']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + df['coverage'] = df['coverage'].fillna(0) + df['coverage'] = df['coverage'].round(2) + + # getting O and H groups + if 'serotypefinder_Serotype_O' in summary_df.columns: + if 'shigatyper_Hit' in summary_df.columns and 'mash_organism' in summary_df.columns: + logging.info('Double checking E. coli organism with shigatyper results') + ecoli_df = summary_df[summary_df['mash_organism'].str.contains('Shigella', na=False) | summary_df['mash_organism'].str.contains('Escherichia', na=False) ].copy() + ecoli_df['serotypefinder_Serotype_O'] = ecoli_df['serotypefinder_Serotype_O'].fillna("none") + ecoli_df['serotypefinder_Serotype_H'] = ecoli_df['serotypefinder_Serotype_H'].fillna("none") + ecoli_df['ecoli_organism'] = "Escherichia coli" + ecoli_df.loc[ecoli_df['shigatyper_Hit'].str.contains('ipaH', na=False), 'ecoli_organism'] = 'Shigella' + ecoli_df['SerotypeFinder (E. coli)'] = ecoli_df['ecoli_organism'] + " " + ecoli_df['serotypefinder_Serotype_O'] + ":" + ecoli_df['serotypefinder_Serotype_H'] + + df = pd.merge(df,ecoli_df[['sample','SerotypeFinder (E. coli)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + else: + summary_df['SerotypeFinder (E. coli)'] = summary_df['serotypefinder_Serotype_O'].apply(str) + ':' + summary_df['serotypefinder_Serotype_H'].apply(str) + + df = pd.merge(df,summary_df[['sample','SerotypeFinder (E. coli)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + else: + df['SerotypeFinder (E. coli)'] = '' + + if 'kraken2_organism_(per_fragment)' in summary_df.columns: + df = kraken2_results(df, summary_df) + + if 'blobtools_organism_(per_mapped_reads)' in summary_df.columns: + df = blobtools_results(df, summary_df) + + return df + +def mash_results(df, args): + mash_dir = args.grandeur + '/mash/' + + if not os.path.isdir(mash_dir): + return df + + logging.info('Getting the top organism from mash at ' + mash_dir) + + dfs = [] + for filename in os.listdir(mash_dir): + if filename.endswith("summary.mash.csv"): + filepath = os.path.join(mash_dir, filename) + ind_df = pd.read_csv(filepath) + if not ind_df.empty: + dfs.append(ind_df) + + if dfs: + mash_df = pd.concat(dfs, ignore_index=True) + else: + return df + + if not mash_df.empty: + mash_df = mash_df.sort_values(by=['P-value', 'mash-distance']) + mash_df = mash_df.drop_duplicates(subset=['sample'], keep = 'first') + + df = pd.merge(df,mash_df[['sample','organism']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + if 'organism_x' in df.keys(): + df['organism'] = None + df['organism'] = df ['organism_x'].combine_first(df['organism']) + df['organism'] = df ['organism'].combine_first(df['organism_y']) + df = df.drop('organism_x', axis=1) + df = df.drop('organism_y', axis=1) + + return df + +def mlst_results(df, args): + mlst_dir = args.grandeur + '/mlst/' + + if not os.path.isdir(mlst_dir): + return df + + logging.info('Getting MLST information from ' + mlst_dir) + + dfs = [] + for filename in os.listdir(mlst_dir): + if filename.endswith("mlst.tsv"): + filepath = os.path.join(mlst_dir, filename) + ind_df = pd.read_table(filepath, sep="\t") + if not ind_df.empty: + dfs.append(ind_df) + + if dfs: + mlst_df = pd.concat(dfs, ignore_index=True) + else: + return df + + if not mlst_df.empty: + mlst_df['mlst'] = mlst_df['matching PubMLST scheme'] + ':' + mlst_df['ST'].astype('str') + df = pd.merge(df,mlst_df[['sample','mlst']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + + +def seqsero2_results(df, args): + seqsero2_dir = args.grandeur + '/seqsero2/' + + if not os.path.isdir(seqsero2_dir): + return df + + logging.info('Getting salmonella serotype information from ' + seqsero2_dir) + + dfs = [] + files = glob.glob(args.grandeur + '/seqsero2/*/SeqSero_result.tsv') + for file in files: + ind_df = pd.read_table(file, sep='\t') + if not ind_df.empty: + dfs.append(ind_df) + + if dfs: + seqsero2_df = pd.concat(dfs, ignore_index=True) + else: + return df + + if not seqsero2_df.empty: + seqsero2_df['SeqSero Organism (Salmonella)'] = 'Salmonella enterica serovar ' + seqsero2_df['Predicted serotype'] + ':' + seqsero2_df['Predicted antigenic profile'] + + df = pd.merge(df,seqsero2_df[['sample','SeqSero Organism (Salmonella)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + else: + df['SeqSero Organism (Salmonella)'] = '' + + return df + +def kraken2_results(df, summary_df): + logging.info("Getting kraken2 results") + df = pd.merge(df,summary_df[['sample','kraken2_organism_(per_fragment)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + +def blobtools_results(df, summary_df): + logging.info("Getting blobtools results") + df = pd.merge(df,summary_df[['sample','blobtools_organism_(per_mapped_reads)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + +def pass_fail(df): + df['Pass'] = 'TBD' + df.loc[df['coverage'] >= 40, 'Pass'] = 'Y' + df.loc[df['coverage'] < 20, 'Pass'] = 'X' + df.loc[(df['organism'].str.contains('Shigella', na=False)) & (df['coverage'] >= 40), 'Pass'] = 'Y' + df.loc[(df['organism'].str.contains('Shigella', na=False)) & (df['coverage'] < 40), 'Pass'] = 'X' + df.loc[(df['organism'].str.contains('Escherichia', na=False)) & (df['coverage'] >= 40), 'Pass'] = 'Y' + df.loc[(df['organism'].str.contains('Escherichia', na=False)) & (df['coverage'] < 40), 'Pass'] = 'X' + df.loc[(df['organism'].str.contains('Salmonella', na=False)) & (df['coverage'] >= 30), 'Pass'] = 'Y' + df.loc[(df['organism'].str.contains('Salmonella', na=False)) & (df['coverage'] < 30), 'Pass'] = 'X' + df.loc[(df['organism'].str.contains('Campylobacter', na=False)) & (df['coverage'] >= 20), 'Pass'] = 'Y' + df.loc[(df['organism'].str.contains('Campylobacter', na=False)) & (df['coverage'] < 20), 'Pass'] = 'X' + + df = df.sort_values('wgs_id') + df['organism'] = df['organism'].str.replace('_',' ',regex=False) + + return df + +def main(): + version = '0.1.24191' + + parser = argparse.ArgumentParser() + parser.add_argument('-g', '--grandeur', type=str, help='directory where Grandeur has output results', required=True) + parser.add_argument('-s', '--samplesheet', type=str, help='sample sheet for run', required=True) + parser.add_argument('-v', '--version', help='print version and exit', action='version', version='%(prog)s ' + version) + args = parser.parse_args() + + df = sample_sheet_to_df(args.samplesheet) + + df = grandeur_summary(df, args) + + df = fastani_results(df, args) + + df = mash_results(df, args) + + df = seqsero2_results(df, args) + + df = mlst_results(df, args) + + df = emmtyper_results(df, args) + + df = pass_fail(df) + + create_files(df) + + +if __name__ == "__main__": + main() \ No newline at end of file From 562b9355bbbbeaeaf2550fbe2f45a58ea79b6f69 Mon Sep 17 00:00:00 2001 From: eriny Date: Tue, 9 Jul 2024 12:40:41 -0600 Subject: [PATCH 4/9] separate functions for reading sample sheets --- read_miseq_sample_sheet.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100755 read_miseq_sample_sheet.py diff --git a/read_miseq_sample_sheet.py b/read_miseq_sample_sheet.py new file mode 100755 index 0000000..64baa25 --- /dev/null +++ b/read_miseq_sample_sheet.py @@ -0,0 +1,18 @@ +import pandas as pd +import logging + +def find_header_line(file_path, header_signal): + with open(file_path, 'r') as file: + for line_number, line in enumerate(file): + if header_signal in line.lower(): + logging.debug(f"The line number in {file_path} was {line_number}") + return line_number + raise ValueError("Header line not found in file") + +def read_miseq_sample_sheet(sample_sheet, header_signal): + # Find the number of lines to skip because it's sometimes variable + lines_to_skip = find_header_line(sample_sheet, header_signal) + + # Read the CSV file into a DataFrame, skipping the appropriate number of lines + df = pd.read_csv(sample_sheet, skiprows=lines_to_skip) + return df From 34d873e003aaa5ec725228f72811382d2df294fc Mon Sep 17 00:00:00 2001 From: eriny Date: Thu, 11 Jul 2024 13:06:08 -0600 Subject: [PATCH 5/9] adjusted to functions --- grandeur_aws_to_sheets.py | 306 +++++++++++++++++--------------------- 1 file changed, 138 insertions(+), 168 deletions(-) diff --git a/grandeur_aws_to_sheets.py b/grandeur_aws_to_sheets.py index ef2c43a..3fd8a40 100755 --- a/grandeur_aws_to_sheets.py +++ b/grandeur_aws_to_sheets.py @@ -25,58 +25,36 @@ logging.basicConfig(format='%(asctime)s - %(message)s', datefmt = '%y-%b-%d %H:%M:%S') -def sample_sheet_to_df(samplesheet): - logging.info(f"Getting samples from {samplesheet}") - - if os.path.exists(samplesheet): - # get all the samples from the sample sheet into a pandas dataframe - df = read_miseq_sample_sheet(samplesheet, "sample_id") - df['lims_id'] = df['Sample_ID'].str.replace( '-UT.*','', regex=True) - df['wgs_id'] = df['Sample_Name'].str.replace('-UT.*','', regex=True) - - return df - - else: - logging.fatal('Sample sheet could not be located! (Specify with -s)') - exit(1) def amrfinder_results(df, args): - amrfinder_dir = args.grandeur + '/ncbi-AMRFinderplus/' - - if not os.path.isdir(amrfinder_dir): - return df - - logging.info('Getting virulence and AMR genes from ' + amrfinder_dir) - dfs = [] - for filename in os.listdir(amrfinder_dir): - if filename.endswith("_amrfinder_plus.txt"): - filepath = os.path.join(amrfinder_dir, filename) - ind_df = pd.read_csv(filepath) - if not ind_df.empty: - dfs.append(ind_df) - - if dfs: - amrfinder_df = pd.concat(dfs, ignore_index=True) - else: - return df + amrfinder_df = results_to_df(f"{args.grandeur}/ncbi-AMRFinderplus/", "\t", "_amrfinder_plus.txt") if not amrfinder_df.empty: amrfinder_df = amrfinder_df.sort_values('Gene symbol') amr_df = amrfinder_df[amrfinder_df['Element type'] == 'AMR'].copy() - amr_df = amr_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: list(x)}) + amr_df = amr_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: ', '.join(list(x))}) amr_df['amr genes'] = amr_df['Gene symbol'] df = pd.merge(df,amr_df[['Name','amr genes']],left_on='Sample_Name', right_on='Name', how='left') df = df.drop('Name', axis=1) vir_df = amrfinder_df[amrfinder_df['Element type'] == 'VIRULENCE'].copy() - vir_df = vir_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: list(x)}) + vir_df = vir_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: ', '.join(list(x))}) vir_df['virulence genes'] = vir_df['Gene symbol'] df = pd.merge(df,vir_df[['Name','virulence genes']],left_on='Sample_Name', right_on='Name', how='left') df = df.drop('Name', axis=1) return df + +def blobtools_results(df, summary_df): + logging.info("Getting blobtools results") + df = pd.merge(df,summary_df[['sample','blobtools_organism_(per_mapped_reads)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + + def create_files(df): # columns for final files # For the results tab : top organism, serotypefinder/shigellatyper, seqsero2, coverage, warnings, blobtools, and kraken2 @@ -88,7 +66,6 @@ def create_files(df): if col not in df.columns: df[col] = None - df = df.fillna('') df = df.sort_values('wgs_id') @@ -104,54 +81,49 @@ def create_files(df): def emmtyper_results(df, args): - emmtyper_dir = args.grandeur + '/emmtyper/' + emmtyper_df = results_to_df(f"{args.grandeur}/emmtyper/", "\t", "_emmtyper.txt") - if not os.path.isdir(emmtyper_dir): - return df - - logging.info('Getting emmtyper information from ' + emmtyper_dir) - - dfs = [] + if not emmtyper_df.empty: + emmtyper_df['emm type'] = emmtyper_df['Predicted emm-type'] + df = pd.merge(df,emmtyper_df[['sample','emm type']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) - for filename in os.listdir(emmtyper_dir): - if filename.endswith("mlst.tsv"): - filepath = os.path.join(emmtyper_dir, filename) - ind_df = pd.read_table(filepath, sep="\t") - if not ind_df.empty: - dfs.append(ind_df) + return df - if dfs: - emmtyper_df = pd.concat(dfs, ignore_index=True) - else: - return df - if not emmtyper_df.empty: - emmtyper_df['mlst'] = mlst_df['matching PubMLST scheme'] + ':' + mlst_df['ST'].astype('str') - df = pd.merge(df,emmtyper_df[['sample','mlst']],left_on='Sample_Name', right_on='sample', how='left') - df = df.drop('sample', axis=1) +def escherichia_serotype(df, summary_df, args): + logging.info('Double checking Escherichia organism with shigatyper results') + + # creating a copy of the summary_df to just the Escherichia samples + ecoli_df = summary_df[summary_df['mash_organism'].str.contains('Shigella', na=False) | summary_df['mash_organism'].str.contains('Escherichia', na=False) ].copy() + ecoli_df['serotypefinder_Serotype_O'] = ecoli_df['serotypefinder_Serotype_O'].fillna("none") + ecoli_df['serotypefinder_Serotype_H'] = ecoli_df['serotypefinder_Serotype_H'].fillna("none") + ecoli_df['O_H'] = ecoli_df['serotypefinder_Serotype_O'].astype(str) + ':' + ecoli_df['serotypefinder_Serotype_H'].astype(str) + ecoli_df['ecoli_organism'] = 'Escherichia coli' + + # making sure the top Shigella organism makes it to the spreadsheet for each ipaH+ sample + if 'shigatyper_hit' in summary_df.columns and 'mash_organism' in summary_df.columns: + shigella_df = ecoli_df[ecoli_df['shigatyper_hit'].str.contains('ipaH')] + for sample in shigella_df['sample'].tolist(): + organism = 'Escherichia coli' + with open(f"{args.grandeur}/mash/{sample}.summary.mash.csv") as file: + for line in file: + line = line.strip() + if 'Shigella' in line: + organism = line.split(',')[-1].replace('_', ' ') + break + ecoli_df.loc[ecoli_df['sample'] == sample, 'ecoli_organism'] = organism + + ecoli_df['SerotypeFinder (E. coli)'] = ecoli_df['ecoli_organism'] + " " + ecoli_df['O_H'] + + df = pd.merge(df, ecoli_df[['sample','SerotypeFinder (E. coli)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) return df def fastani_results(df, args): - fastani_dir = args.grandeur + '/fastani/' - - if not os.path.isdir(fastani_dir): - return df - - logging.info('Getting the top organism from fastani at ' + fastani_dir) - dfs = [] - for filename in os.listdir(fastani_dir): - if filename.endswith("_fastani.csv"): - filepath = os.path.join(fastani_dir, filename) - ind_df = pd.read_csv(filepath) - if not ind_df.empty: - dfs.append(ind_df) - - if dfs: - fastani_df = pd.concat(dfs, ignore_index=True) - else: - return df + fastani_df = results_to_df(f"{args.grandeur}/fastani/", ',', '_fastani.csv') # Getting the WGS organism from fastani or mash if not fastani_df.empty: @@ -167,7 +139,7 @@ def fastani_results(df, args): def grandeur_summary(df, args): - summary = args.grandeur + '/grandeur_summary.tsv' + summary = args.grandeur + '/grandeur_summary.tsv' logging.info('Extracting information from ' + summary) # using results in summary file instead of the "original" @@ -183,26 +155,8 @@ def grandeur_summary(df, args): df['coverage'] = df['coverage'].fillna(0) df['coverage'] = df['coverage'].round(2) - # getting O and H groups if 'serotypefinder_Serotype_O' in summary_df.columns: - if 'shigatyper_Hit' in summary_df.columns and 'mash_organism' in summary_df.columns: - logging.info('Double checking E. coli organism with shigatyper results') - ecoli_df = summary_df[summary_df['mash_organism'].str.contains('Shigella', na=False) | summary_df['mash_organism'].str.contains('Escherichia', na=False) ].copy() - ecoli_df['serotypefinder_Serotype_O'] = ecoli_df['serotypefinder_Serotype_O'].fillna("none") - ecoli_df['serotypefinder_Serotype_H'] = ecoli_df['serotypefinder_Serotype_H'].fillna("none") - ecoli_df['ecoli_organism'] = "Escherichia coli" - ecoli_df.loc[ecoli_df['shigatyper_Hit'].str.contains('ipaH', na=False), 'ecoli_organism'] = 'Shigella' - ecoli_df['SerotypeFinder (E. coli)'] = ecoli_df['ecoli_organism'] + " " + ecoli_df['serotypefinder_Serotype_O'] + ":" + ecoli_df['serotypefinder_Serotype_H'] - - df = pd.merge(df,ecoli_df[['sample','SerotypeFinder (E. coli)']],left_on='Sample_Name', right_on='sample', how='left') - df = df.drop('sample', axis=1) - else: - summary_df['SerotypeFinder (E. coli)'] = summary_df['serotypefinder_Serotype_O'].apply(str) + ':' + summary_df['serotypefinder_Serotype_H'].apply(str) - - df = pd.merge(df,summary_df[['sample','SerotypeFinder (E. coli)']],left_on='Sample_Name', right_on='sample', how='left') - df = df.drop('sample', axis=1) - else: - df['SerotypeFinder (E. coli)'] = '' + df = escherichia_serotype(df, summary_df, args) if 'kraken2_organism_(per_fragment)' in summary_df.columns: df = kraken2_results(df, summary_df) @@ -212,26 +166,17 @@ def grandeur_summary(df, args): return df -def mash_results(df, args): - mash_dir = args.grandeur + '/mash/' - - if not os.path.isdir(mash_dir): - return df - - logging.info('Getting the top organism from mash at ' + mash_dir) - dfs = [] - for filename in os.listdir(mash_dir): - if filename.endswith("summary.mash.csv"): - filepath = os.path.join(mash_dir, filename) - ind_df = pd.read_csv(filepath) - if not ind_df.empty: - dfs.append(ind_df) +def kraken2_results(df, summary_df): + logging.info("Getting kraken2 results") + df = pd.merge(df,summary_df[['sample','kraken2_organism_(per_fragment)']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) - if dfs: - mash_df = pd.concat(dfs, ignore_index=True) - else: - return df + return df + + +def mash_results(df, args): + mash_df = results_to_df(f"{args.grandeur}/mash/", ",", "summary.mash.csv") if not mash_df.empty: mash_df = mash_df.sort_values(by=['P-value', 'mash-distance']) @@ -249,33 +194,88 @@ def mash_results(df, args): return df + def mlst_results(df, args): - mlst_dir = args.grandeur + '/mlst/' - - if not os.path.isdir(mlst_dir): - return df + mlst_df = results_to_df(f"{args.grandeur}/mlst/", "\t", "mlst.tsv") + + if not mlst_df.empty: + mlst_df['mlst'] = mlst_df['matching PubMLST scheme'] + ':' + mlst_df['ST'].astype('str') + df = pd.merge(df,mlst_df[['sample','mlst']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + + +def pass_fail(df): + df['Pass'] = 'TBD' + df.loc[df['coverage'] >= 40, 'Pass'] = 'Y' + df.loc[df['coverage'] < 20, 'Pass'] = 'X' + + organisms = [ + 'Acinetobacter', + 'Citrobacter', + 'Elizabethkingia', + 'Enterobacter', + 'Escherichia', + 'Klebsiella', + 'Listeria', + 'Neisseria', + 'Providencia', + 'Pseudomonas', + 'Ralstonia', + 'Serratia', + 'Shigella', + 'Streptococcus' ] + + for organism in organisms: + df.loc[(df['organism'].str.contains(organism, na=False)) & (df['coverage'] >= 40), 'Pass'] = 'Y' + df.loc[(df['organism'].str.contains(organism, na=False)) & (df['coverage'] < 40), 'Pass'] = 'X' + + df.loc[(df['organism'].str.contains('Salmonella', na=False)) & (df['coverage'] >= 30), 'Pass'] = 'Y' + df.loc[(df['organism'].str.contains('Salmonella', na=False)) & (df['coverage'] < 30), 'Pass'] = 'X' + df.loc[(df['organism'].str.contains('Campylobacter', na=False)) & (df['coverage'] >= 20), 'Pass'] = 'Y' + df.loc[(df['organism'].str.contains('Campylobacter', na=False)) & (df['coverage'] < 20), 'Pass'] = 'X' + + df = df.sort_values('wgs_id') + df['organism'] = df['organism'].str.replace('_',' ',regex=False) + + return df + + +def results_to_df(path, delim, end): + if not os.path.isdir(path): + return pd.DataFrame() - logging.info('Getting MLST information from ' + mlst_dir) + logging.info(f"Getting information from {path}") dfs = [] - for filename in os.listdir(mlst_dir): - if filename.endswith("mlst.tsv"): - filepath = os.path.join(mlst_dir, filename) - ind_df = pd.read_table(filepath, sep="\t") + for filename in os.listdir(path): + if filename.endswith(end): + filepath = os.path.join(path, filename) + ind_df = pd.read_table(filepath, sep=delim) if not ind_df.empty: dfs.append(ind_df) if dfs: - mlst_df = pd.concat(dfs, ignore_index=True) + return pd.concat(dfs, ignore_index=True) else: - return df + return pd.DataFrame() - if not mlst_df.empty: - mlst_df['mlst'] = mlst_df['matching PubMLST scheme'] + ':' + mlst_df['ST'].astype('str') - df = pd.merge(df,mlst_df[['sample','mlst']],left_on='Sample_Name', right_on='sample', how='left') - df = df.drop('sample', axis=1) - return df +def sample_sheet_to_df(samplesheet): + logging.info(f"Getting samples from {samplesheet}") + + if os.path.exists(samplesheet): + # get all the samples from the sample sheet into a pandas dataframe + df = read_miseq_sample_sheet(samplesheet, "sample_id") + df['lims_id'] = df['Sample_ID'].str.replace( '-UT.*','', regex=True) + df['wgs_id'] = df['Sample_Name'].str.replace('-UT.*','', regex=True) + + return df + + else: + logging.fatal('Sample sheet could not be located! (Specify with -s)') + exit(1) def seqsero2_results(df, args): @@ -308,38 +308,6 @@ def seqsero2_results(df, args): return df -def kraken2_results(df, summary_df): - logging.info("Getting kraken2 results") - df = pd.merge(df,summary_df[['sample','kraken2_organism_(per_fragment)']],left_on='Sample_Name', right_on='sample', how='left') - df = df.drop('sample', axis=1) - - return df - -def blobtools_results(df, summary_df): - logging.info("Getting blobtools results") - df = pd.merge(df,summary_df[['sample','blobtools_organism_(per_mapped_reads)']],left_on='Sample_Name', right_on='sample', how='left') - df = df.drop('sample', axis=1) - - return df - -def pass_fail(df): - df['Pass'] = 'TBD' - df.loc[df['coverage'] >= 40, 'Pass'] = 'Y' - df.loc[df['coverage'] < 20, 'Pass'] = 'X' - df.loc[(df['organism'].str.contains('Shigella', na=False)) & (df['coverage'] >= 40), 'Pass'] = 'Y' - df.loc[(df['organism'].str.contains('Shigella', na=False)) & (df['coverage'] < 40), 'Pass'] = 'X' - df.loc[(df['organism'].str.contains('Escherichia', na=False)) & (df['coverage'] >= 40), 'Pass'] = 'Y' - df.loc[(df['organism'].str.contains('Escherichia', na=False)) & (df['coverage'] < 40), 'Pass'] = 'X' - df.loc[(df['organism'].str.contains('Salmonella', na=False)) & (df['coverage'] >= 30), 'Pass'] = 'Y' - df.loc[(df['organism'].str.contains('Salmonella', na=False)) & (df['coverage'] < 30), 'Pass'] = 'X' - df.loc[(df['organism'].str.contains('Campylobacter', na=False)) & (df['coverage'] >= 20), 'Pass'] = 'Y' - df.loc[(df['organism'].str.contains('Campylobacter', na=False)) & (df['coverage'] < 20), 'Pass'] = 'X' - - df = df.sort_values('wgs_id') - df['organism'] = df['organism'].str.replace('_',' ',regex=False) - - return df - def main(): version = '0.1.24191' @@ -351,21 +319,23 @@ def main(): df = sample_sheet_to_df(args.samplesheet) - df = grandeur_summary(df, args) + df = grandeur_summary( df, args) + + df = fastani_results( df, args) - df = fastani_results(df, args) + df = mash_results( df, args) - df = mash_results(df, args) + df = seqsero2_results( df, args) - df = seqsero2_results(df, args) + df = mlst_results( df, args) - df = mlst_results(df, args) + df = emmtyper_results( df, args) - df = emmtyper_results(df, args) + df = amrfinder_results( df, args) - df = pass_fail(df) + df = pass_fail( df) - create_files(df) + create_files( df) if __name__ == "__main__": From 01a3aa5d7d3467084011f5e8f7de7295e24e5184 Mon Sep 17 00:00:00 2001 From: eriny Date: Fri, 12 Jul 2024 12:19:40 -0600 Subject: [PATCH 6/9] add docstrings --- download_by_analysis.py | 0 download_by_run.py | 0 ..._aws_to_sheets.py => grandeur_to_sheets.py | 220 +++++++++++++++++- 3 files changed, 209 insertions(+), 11 deletions(-) delete mode 100755 download_by_analysis.py delete mode 100755 download_by_run.py rename grandeur_aws_to_sheets.py => grandeur_to_sheets.py (71%) diff --git a/download_by_analysis.py b/download_by_analysis.py deleted file mode 100755 index e69de29..0000000 diff --git a/download_by_run.py b/download_by_run.py deleted file mode 100755 index e69de29..0000000 diff --git a/grandeur_aws_to_sheets.py b/grandeur_to_sheets.py similarity index 71% rename from grandeur_aws_to_sheets.py rename to grandeur_to_sheets.py index 3fd8a40..f07ce8c 100755 --- a/grandeur_aws_to_sheets.py +++ b/grandeur_to_sheets.py @@ -9,8 +9,10 @@ 1. The 'Finished' tab (will be discontinued in the future) 2. As a lookup table for the 'ARLN' tab +It's really meant to be copied and pasted into corresponding google sheets. + EXAMPLE: -python3 grandeur_aws_to_sheets.py -g -s +python3 grandeur_to_sheets.py -g -s ''' # trying to keep dependencies really low @@ -23,21 +25,36 @@ # local files from read_miseq_sample_sheet import read_miseq_sample_sheet -logging.basicConfig(format='%(asctime)s - %(message)s', datefmt = '%y-%b-%d %H:%M:%S') + def amrfinder_results(df, args): + """ + + Parses amrfinder output + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + amrfinder_df = results_to_df(f"{args.grandeur}/ncbi-AMRFinderplus/", "\t", "_amrfinder_plus.txt") if not amrfinder_df.empty: amrfinder_df = amrfinder_df.sort_values('Gene symbol') + # amr results amr_df = amrfinder_df[amrfinder_df['Element type'] == 'AMR'].copy() amr_df = amr_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: ', '.join(list(x))}) amr_df['amr genes'] = amr_df['Gene symbol'] df = pd.merge(df,amr_df[['Name','amr genes']],left_on='Sample_Name', right_on='Name', how='left') df = df.drop('Name', axis=1) + # virulence results vir_df = amrfinder_df[amrfinder_df['Element type'] == 'VIRULENCE'].copy() vir_df = vir_df.groupby('Name', as_index=False).agg({'Gene symbol': lambda x: ', '.join(list(x))}) vir_df['virulence genes'] = vir_df['Gene symbol'] @@ -48,6 +65,19 @@ def amrfinder_results(df, args): def blobtools_results(df, summary_df): + """ + + Parses blobtools output + + Args: + df (pd.Dataframe): dataframe for results thus far. + summary_df (pd.Dataframe): dataframe of grandeur results. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + logging.info("Getting blobtools results") df = pd.merge(df,summary_df[['sample','blobtools_organism_(per_mapped_reads)']],left_on='Sample_Name', right_on='sample', how='left') df = df.drop('sample', axis=1) @@ -56,6 +86,18 @@ def blobtools_results(df, summary_df): def create_files(df): + """ + + Creates final files. + + Args: + df (pd.Dataframe): dataframe for results to print to file(s). + + Creates: + file (file): Results files. A lot of them. + + """ + # columns for final files # For the results tab : top organism, serotypefinder/shigellatyper, seqsero2, coverage, warnings, blobtools, and kraken2 finished_cols = ['wgs_id', 'Description', 'organism', 'SerotypeFinder (E. coli)', 'SeqSero Organism (Salmonella)', 'Sample_Project', 'coverage', 'Pass', 'warnings', 'blobtools_organism_(per_mapped_reads)'] @@ -81,6 +123,19 @@ def create_files(df): def emmtyper_results(df, args): + """ + + Parses emmtyper output + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + emmtyper_df = results_to_df(f"{args.grandeur}/emmtyper/", "\t", "_emmtyper.txt") if not emmtyper_df.empty: @@ -92,6 +147,19 @@ def emmtyper_results(df, args): def escherichia_serotype(df, summary_df, args): + """ + + Double checks output for Escherichia species + + Args: + df (pd.Dataframe): dataframe for results thus far. + summary_df (pd.Dataframe): dataframe of grandeur results. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ logging.info('Double checking Escherichia organism with shigatyper results') # creating a copy of the summary_df to just the Escherichia samples @@ -123,6 +191,19 @@ def escherichia_serotype(df, summary_df, args): def fastani_results(df, args): + """ + + Parses fastani output + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + fastani_df = results_to_df(f"{args.grandeur}/fastani/", ',', '_fastani.csv') # Getting the WGS organism from fastani or mash @@ -139,14 +220,26 @@ def fastani_results(df, args): def grandeur_summary(df, args): - summary = args.grandeur + '/grandeur_summary.tsv' - logging.info('Extracting information from ' + summary) + """ + + Parses grandeur summary + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + summary = f"{args.grandeur}/grandeur_summary.tsv" + logging.info(f"Extracting information from {summary}") # using results in summary file instead of the "original" - # blobtools = args.grandeur + '/blobtools/blobtools_summary.txt' - # kraken2 = args.grandeur + '/kraken2/kraken2_summary.csv' - # serotypefinder = args.grandeur + '/serotypefinder/serotypefinder_results.txt' - # shigatyper = args.grandeur + '/shigatyper/shigatyper_results.txt' + # blobtools = f"{args.grandeur}/blobtools/blobtools_summary.txt" + # kraken2 = f"{args.grandeur}/kraken2/kraken2_summary.csv" + # serotypefinder = f"{args.grandeur}/serotypefinder/serotypefinder_results.txt" + # shigatyper = f"{args.grandeur}/shigatyper/shigatyper_results.txt" # getting coverage summary_df = pd.read_table(summary) @@ -168,6 +261,18 @@ def grandeur_summary(df, args): def kraken2_results(df, summary_df): + """ + + Parses kraken2 output + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ logging.info("Getting kraken2 results") df = pd.merge(df,summary_df[['sample','kraken2_organism_(per_fragment)']],left_on='Sample_Name', right_on='sample', how='left') df = df.drop('sample', axis=1) @@ -176,6 +281,18 @@ def kraken2_results(df, summary_df): def mash_results(df, args): + """ + + Parses mash output + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ mash_df = results_to_df(f"{args.grandeur}/mash/", ",", "summary.mash.csv") if not mash_df.empty: @@ -196,6 +313,18 @@ def mash_results(df, args): def mlst_results(df, args): + """ + + Parses mlst output + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + """ + mlst_df = results_to_df(f"{args.grandeur}/mlst/", "\t", "mlst.tsv") if not mlst_df.empty: @@ -207,10 +336,24 @@ def mlst_results(df, args): def pass_fail(df): + """ + + Uses coverage to set some basic pass/fail conditions. + + Args: + df (pd.Dataframe): results thus far. + + Returns: + df (pd.Dataframe): Pandas dataframe with 'Pass' column. + + """ + + # in general conditions df['Pass'] = 'TBD' df.loc[df['coverage'] >= 40, 'Pass'] = 'Y' df.loc[df['coverage'] < 20, 'Pass'] = 'X' + # organism specific conditions organisms = [ 'Acinetobacter', 'Citrobacter', @@ -243,6 +386,20 @@ def pass_fail(df): def results_to_df(path, delim, end): + """ + + Combines results for files into a dataframe + + Args: + path (str): directory with results stored. + delim (str): delimiter used in file ("," or "/t" are the most common). + end (str): The last characters of a filename. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + if not os.path.isdir(path): return pd.DataFrame() @@ -263,6 +420,18 @@ def results_to_df(path, delim, end): def sample_sheet_to_df(samplesheet): + """ + + Creates pandas dataframe from MiSeq sample sheet. + + Args: + samplesheet (str): path to sample sheet. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + logging.info(f"Getting samples from {samplesheet}") if os.path.exists(samplesheet): @@ -279,15 +448,29 @@ def sample_sheet_to_df(samplesheet): def seqsero2_results(df, args): - seqsero2_dir = args.grandeur + '/seqsero2/' + """ + + Parses seqsero2 output + + Args: + df (pd.Dataframe): dataframe for results thus far. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + + seqsero2_dir = f"{args.grandeur}/seqsero2/" if not os.path.isdir(seqsero2_dir): return df - logging.info('Getting salmonella serotype information from ' + seqsero2_dir) + logging.info(f"Getting salmonella serotype information from {seqsero2_dir}") + # does not use results_to_df because of file structure dfs = [] - files = glob.glob(args.grandeur + '/seqsero2/*/SeqSero_result.tsv') + files = glob.glob(f"{args.grandeur}/seqsero2/*/SeqSero_result.tsv") for file in files: ind_df = pd.read_table(file, sep='\t') if not ind_df.empty: @@ -308,7 +491,22 @@ def seqsero2_results(df, args): return df + def main(): + """ + + Parses output from Grandeur version 3. + + Args: + args (argparse.Namespace): Parsed command-line arguments. + + Prints: + files (str): Files for Results tab and ARLN Regional tab. + + """ + + logging.basicConfig(format='%(asctime)s - %(message)s', datefmt = '%y-%b-%d %H:%M:%S', level=logging.INFO) + version = '0.1.24191' parser = argparse.ArgumentParser() From 06c6ca270cfb2809ef3336deb948fbdbccef297a Mon Sep 17 00:00:00 2001 From: eriny Date: Fri, 12 Jul 2024 12:20:18 -0600 Subject: [PATCH 7/9] added usage for grandeur_to_sheets.py --- README.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/README.md b/README.md index 4cc2c67..77c2294 100644 --- a/README.md +++ b/README.md @@ -201,6 +201,22 @@ The script will populate the vcf_files directory with the desired VCF files. Eac This script streamlines the process of collecting and organizing necessary data for mycoSNP analysis. It significantly reduces manual data handling and potential errors. +## grandeur_to_sheets.py + +This script takes a MiSeq sample sheet and the directory of grandeur results (regardless of where they were run) and creates two files to make it "easy" to get results into the "Finished" and "ARLN_regional" tabs. + +EXAMPLE: +```bash +python3 grandeur_to_sheets.py -g aws_results -s SampleSheet.csv +``` + +Four files are generated: +- arln_tab.tsv : tab-delimited results relevant to the "ARLN_regional" tab. +- arln_tab.txt : ";" -delimited results relevant to the "ARLN_regional" tab. +- finished_tab.tsv : tab-delimited results relevant to the "Finished" tab. +- finished_tab.txt : ";" -delimited results relevant to the "Finished" tab. + + ## changeseqids.py This script updates sequence identifiers in the 'vcf-to-fasta.fasta' file, which is an output file of mycoSNP. It ensures that sequence IDs in the 'vcf-to-fasta.fasta' file, used in creating a Newick file, align with the specific naming conventions set by the CDC's Mycotic Disease Branch. Users must place three specific files in the same directory as this script: From f63274431eae647c67670b1df8ecd048f00b097f Mon Sep 17 00:00:00 2001 From: eriny Date: Fri, 12 Jul 2024 16:09:04 -0600 Subject: [PATCH 8/9] uploading and downloading from aws --- aws_download.py | 93 ++++++++++++++++++++ aws_upload.py | 226 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 319 insertions(+) create mode 100755 aws_download.py create mode 100755 aws_upload.py diff --git a/aws_download.py b/aws_download.py new file mode 100755 index 0000000..e4ae1f9 --- /dev/null +++ b/aws_download.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +''' +Author: Erin Young + +Description: + +This script will download all files from an AWS bucket to a +designated directory. + +EXAMPLE: +python3 aws_download.py -a UT-M07101-240710 -d /Volumes/IDGenomics_NAS/pulsenet_and_arln/UT-M07101-240710/aws_results + +''' + +import boto3 +import os +from concurrent.futures import ThreadPoolExecutor, as_completed +import argparse + +def download_file(s3_resource, bucket_name, s3_key, local_directory): + """ + Downloads a single file from S3 to a local directory. + + Args: + s3_resource (boto3.resource('s3')): Boto3 S3 resource. + bucket_name (str): Name of the S3 bucket. + s3_key (str): Key of the file in S3. + local_directory (str): Local directory path where the file will be saved. + """ + local_path = os.path.join(local_directory, os.path.basename(s3_key)) + try: + s3_resource.Bucket(bucket_name).download_file(s3_key, local_path) + print(f'Downloaded {s3_key} to {local_path}') + return s3_key, True + except Exception as e: + print(f'Error downloading {s3_key}: {e}') + return s3_key, False + + +def download_directory(bucket_name, s3_directory, local_directory, profile_name, region_name): + """ + Downloads all files from a directory in an S3 bucket to a local directory in parallel. + + Args: + bucket_name (str): Name of the S3 bucket. + s3_directory (str): Directory in the S3 bucket to download from. + local_directory (str): Local directory path where files will be saved. + profile_name (str): AWS profile name configured in your AWS credentials. + region_name (str): AWS region where the S3 bucket is located. + """ + # Create a session using the specified profile and region + session = boto3.Session(profile_name=profile_name, region_name=region_name) + s3_resource = session.resource('s3') + + # Ensure the local directory exists + os.makedirs(local_directory, exist_ok=True) + + # List all objects in the S3 directory + bucket = s3_resource.Bucket(bucket_name) + objects = bucket.objects.filter(Prefix=s3_directory) + + # Download files in parallel + with ThreadPoolExecutor() as executor: + futures = [] + for obj in objects: + if obj.key == s3_directory: # Skip the directory itself + continue + futures.append(executor.submit(download_file, s3_resource, bucket_name, obj.key, local_directory)) + + # Process completed futures + for future in as_completed(futures): + s3_key, success = future.result() + if success: + print(f'Successfully downloaded {s3_key}') + else: + print(f'Failed to download {s3_key}') + + +def main(): + parser = argparse.ArgumentParser(description='Download files from S3 bucket directory.') + parser.add_argument('-b', '--bucket', help='Name of the S3 bucket', default='dhhs-uphl-omics-outputs-dev', required=False) + parser.add_argument('-a', '--awsdir', help='Directory in the S3 bucket to download from', required=True) + parser.add_argument('-d', '--dir', help='Local directory path where files will be saved', required=True) + parser.add_argument('-r', '--region', type=str, help='AWS bucket region', default='us-west-2', required=False) + parser.add_argument('-p', '--profile', type=str, help='AWS credential profile', default='155221691104_dhhs-uphl-biongs-dev', required=False) + args = parser.parse_args() + + download_directory(args.bucket, args.awsdir, args.dir, args.profile, args.region) + + +if __name__ == '__main__': + main() diff --git a/aws_upload.py b/aws_upload.py new file mode 100755 index 0000000..b926e32 --- /dev/null +++ b/aws_upload.py @@ -0,0 +1,226 @@ +#!/usr/bin/env python3 + +''' +Author: Erin Young + +Description: + +This script will create two directories with the same name in +dhhs-uphl-omics-inputs-dev and dhhs-uphl-omics-outputs-dev designated +by '-a'. Then it will upload the files designated by '-f' and/or +everything in the directory designated by '-d'. + +If the directory already exists, this script is intended to fail unless +'-F' or '--force' is used. + +EXAMPLE: +python3 aws_upload.py -a UT-M07101-240710 -d /Volumes/IDGenomics_NAS/pulsenet_and_arln/UT-M07101-240710/reads + +python3 aws_upload.py -a UT-M07101-240710 -f /Volumes/IDGenomics_NAS/pulsenet_and_arln/UT-M07101-240710/reads/SampleSheet.csv + +''' + +# may need to configure with aws configure --profile 155221691104_dhhs-uphl-biongs-dev +import argparse +import os +import boto3 +import logging +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed + +def check_directory(bucket_name, directory_name, s3): + """ + Checks if directory exists in S3 bucket. + + Args: + bucket_name (str): Bucket name. + directory_name (str): Directory name to check. + s3 (session.client): AWS session.client('s3'). + + Returns: + boolean: True if directory exists, False otherwise. + """ + try: + response = s3.head_object(Bucket=bucket_name, Key=f'{directory_name}/') + return True + except: + return False + + +def create_directory(bucket_name, directory_name, s3, force): + """ + Creates directory in S3 bucket if it doesn't already exist. + + Args: + bucket_name (str): Bucket name. + directory_name (str): Directory name to create. + s3 (session.client): AWS session.client('s3'). + force (bool): Whether to force creation if directory already exists. + """ + if check_directory(bucket_name, directory_name, s3): + logging.warning(f'Directory {directory_name} already exists in {bucket_name}!') + + if not force: + logging.fatal(f'Use \'--force\' to upload to directory anyway.') + sys.exit(1) + else: + return + + else: + try: + s3.put_object(Bucket=bucket_name, Key=f'{directory_name}/') + logging.info(f'Directory {directory_name} created in bucket {bucket_name}') + except Exception as e: + logging.error(f'Error creating directory {directory_name} in bucket {bucket_name}: {e}') + sys.exit(1) + + +def upload_dir(bucket_name, s3_directory, local_directory, s3): + """ + Uploads entire directory to S3. + + Args: + bucket_name (str): Bucket name. + s3_directory (str): Directory path in S3. + local_directory (str): Local directory path. + s3 (session.client): AWS session.client('s3'). + """ + logging.info(f'Uploading everything in {local_directory} to {s3_directory}') + + # Ensure the directory name ends with a '/' + if not s3_directory.endswith('/'): + s3_directory += '/' + + # Create a list to hold the files to be uploaded + files_to_upload = [] + + # Walk through the local directory to collect file paths + for root, dirs, files in os.walk(local_directory): + for file in files: + local_path = os.path.join(root, file) + relative_path = os.path.relpath(local_path, local_directory) + s3_path = os.path.join(s3_directory, relative_path) + files_to_upload.append((s3_path, local_path)) + + # Upload files in parallel + with ThreadPoolExecutor(max_workers=4) as executor: + futures = [executor.submit(upload_file_from_dir, s3, bucket_name, s3_path, local_path) for s3_path, local_path in files_to_upload] + for future in as_completed(futures): + local_path, success = future.result() + if success: + logging.info(f'Successfully uploaded {local_path}') + else: + logging.error(f'Failed to upload {local_path}') + + +def upload_file(s3, bucket_name, s3_directory, local_file): + """ + Uploads files to base of directory in bucket. + + Args: + s3 (session.client): AWS session.client('s3'). + bucket_name (str): Bucket name. + s3_path (str): Directory path in S3. + local_path (str): Local file path. + """ + + try: + s3_path = os.path.join(s3_directory, os.path.basename(local_file)) + s3.upload_file(local_file, bucket_name, s3_path) + logging.info(f'Uploaded {local_file} to s3://{bucket_name}/{s3_path}') + return local_file, True + except Exception as e: + logging.error(f'Error uploading {local_file}: {e}') + return local_file, False + + +def upload_file_from_dir(s3, bucket_name, s3_path, local_path): + """ + Uploads files to directory in bucket. + + Args: + s3 (session.client): AWS session.client('s3'). + bucket_name (str): Bucket name. + s3_path (str): Directory path in S3. + local_path (str): Local file path. + """ + logging.info(f'Uploading {local_path} to {s3_path}') + try: + s3.upload_file(local_path, bucket_name, s3_path) + logging.info(f'Uploaded {local_path} to s3://{bucket_name}/{s3_path}') + return local_path, True + except Exception as e: + logging.error(f'Error uploading {local_path} to s3://{bucket_name}/{s3_path}: {e}') + return local_path, False + + +def upload_files(bucket_name, s3_directory, files, s3): + """ + Uploads specified files to S3 directory. + + Args: + bucket_name (str): Bucket name. + s3_directory (str): Directory path in S3. + files (list): List of file paths to upload. + s3 (session.client): AWS session.client('s3'). + """ + logging.info(f'Uploading file(s) {files} to {s3_directory}') + + # Ensure the directory name ends with a '/' + if not s3_directory.endswith('/'): + s3_directory += '/' + + # Upload files in parallel + with ThreadPoolExecutor() as executor: + futures = [executor.submit(upload_file, s3, bucket_name, s3_directory, file) for file in files] + for future in as_completed(futures): + file, success = future.result() + if success: + logging.info(f'Successfully uploaded {file}') + else: + logging.warning(f'Failed to upload {file}') + + +def main(): + """ + Main function to handle command line arguments and execute script. + """ + logging.basicConfig(format='%(asctime)s - %(message)s', datefmt='%y-%b-%d %H:%M:%S', level=logging.INFO) + + version = '0.1.0' + + parser = argparse.ArgumentParser() + parser.add_argument('-a', '--awsdir', type=str, help='Directory in AWS S3 to create/upload to', required=True) + parser.add_argument('-d', '--dir', type=str, help='Local directory with files to upload', required=False) + parser.add_argument('-f', '--file', type=str, nargs='+', help='File(s) to upload', required=False) + parser.add_argument('-v', '--version', help='Print version and exit', action='version', version='%(prog)s ' + version) + parser.add_argument('-b', '--input_bucket', type=str, help='Name of S3 input bucket', default='dhhs-uphl-omics-inputs-dev', required=False) + parser.add_argument('-B', '--output_bucket', type=str, help='Name of S3 output bucket', default='dhhs-uphl-omics-outputs-dev', required=False) + parser.add_argument('-F', '--force', action='store_true', help='Force upload to bucket that already exists', required=False) + parser.add_argument('-r', '--region', type=str, help='AWS bucket region', default='us-west-2', required=False) + parser.add_argument('-p', '--profile', type=str, help='AWS credential profile', default='155221691104_dhhs-uphl-biongs-dev', required=False) + + args = parser.parse_args() + + session = boto3.Session(profile_name=args.profile, region_name=args.region) + s3 = session.client('s3') + + # Create directories for workflows + create_directory(args.input_bucket, args.awsdir, s3, args.force) + create_directory(args.output_bucket, args.awsdir, s3, args.force) + + # Upload files + if args.file and len(args.file) > 0: + upload_files(args.input_bucket, args.awsdir, args.file, s3) + + # Upload contents of directory + if args.dir: + if os.path.isdir(args.dir): + upload_dir(args.input_bucket, args.awsdir, args.dir, s3) + else: + logging.fatal(f'{args.dir} does not exist!') + sys.exit(1) + + +if __name__ == '__main__': + main() From 6215a89381b94142a752ccab286c2d43321c6172 Mon Sep 17 00:00:00 2001 From: eriny Date: Mon, 15 Jul 2024 12:16:13 -0600 Subject: [PATCH 9/9] fixed ecoli shigella serotyping --- grandeur_to_sheets.py | 169 +++++++++++++++++++++++++----------------- 1 file changed, 102 insertions(+), 67 deletions(-) diff --git a/grandeur_to_sheets.py b/grandeur_to_sheets.py index f07ce8c..fb5bda7 100755 --- a/grandeur_to_sheets.py +++ b/grandeur_to_sheets.py @@ -15,19 +15,22 @@ python3 grandeur_to_sheets.py -g -s ''' +# ignore line too long warnings +# pylint: disable=C0301 + # trying to keep dependencies really low import argparse -import pandas as pd import os +import sys import logging import glob +import pandas as pd + # local files from read_miseq_sample_sheet import read_miseq_sample_sheet - - def amrfinder_results(df, args): """ @@ -60,7 +63,7 @@ def amrfinder_results(df, args): vir_df['virulence genes'] = vir_df['Gene symbol'] df = pd.merge(df,vir_df[['Name','virulence genes']],left_on='Sample_Name', right_on='Name', how='left') df = df.drop('Name', axis=1) - + return df @@ -146,48 +149,7 @@ def emmtyper_results(df, args): return df -def escherichia_serotype(df, summary_df, args): - """ - Double checks output for Escherichia species - - Args: - df (pd.Dataframe): dataframe for results thus far. - summary_df (pd.Dataframe): dataframe of grandeur results. - args (argparse.Namespace): Parsed command-line arguments. - - Returns: - df (pd.Dataframe): Pandas dataframe of the parsed output. - - """ - logging.info('Double checking Escherichia organism with shigatyper results') - - # creating a copy of the summary_df to just the Escherichia samples - ecoli_df = summary_df[summary_df['mash_organism'].str.contains('Shigella', na=False) | summary_df['mash_organism'].str.contains('Escherichia', na=False) ].copy() - ecoli_df['serotypefinder_Serotype_O'] = ecoli_df['serotypefinder_Serotype_O'].fillna("none") - ecoli_df['serotypefinder_Serotype_H'] = ecoli_df['serotypefinder_Serotype_H'].fillna("none") - ecoli_df['O_H'] = ecoli_df['serotypefinder_Serotype_O'].astype(str) + ':' + ecoli_df['serotypefinder_Serotype_H'].astype(str) - ecoli_df['ecoli_organism'] = 'Escherichia coli' - - # making sure the top Shigella organism makes it to the spreadsheet for each ipaH+ sample - if 'shigatyper_hit' in summary_df.columns and 'mash_organism' in summary_df.columns: - shigella_df = ecoli_df[ecoli_df['shigatyper_hit'].str.contains('ipaH')] - for sample in shigella_df['sample'].tolist(): - organism = 'Escherichia coli' - with open(f"{args.grandeur}/mash/{sample}.summary.mash.csv") as file: - for line in file: - line = line.strip() - if 'Shigella' in line: - organism = line.split(',')[-1].replace('_', ' ') - break - ecoli_df.loc[ecoli_df['sample'] == sample, 'ecoli_organism'] = organism - - ecoli_df['SerotypeFinder (E. coli)'] = ecoli_df['ecoli_organism'] + " " + ecoli_df['O_H'] - - df = pd.merge(df, ecoli_df[['sample','SerotypeFinder (E. coli)']],left_on='Sample_Name', right_on='sample', how='left') - df = df.drop('sample', axis=1) - - return df def fastani_results(df, args): @@ -219,6 +181,42 @@ def fastani_results(df, args): return df + +def fix_escherichia(row, directory): + """ + + Checking for ipaH. + + Args: + row (pd.Dataframe): dataframe row + + Returns: + organism (str): Predicted organism. + + """ + sample = row['Sample_Name'] + organism = row['organism'] + shiga_hit = row['shigatyper_hit'] + + if pd.notna(row['ecoli_O_H']) and pd.notna(row['shigatyper_hit']): + if 'IpaH' in shiga_hit: + org_check = 'Shigella' + else: + org_check = 'Escherichia' + + fastani_file = f"{directory}/fastani/{sample}_fastani.csv" + if os.path.exists(fastani_file): + with open(fastani_file) as file: + for line in file: + line = line.strip() + if org_check in line: + ref = line.split(',')[2].split('_') + organism = f"{ref[0]} {ref[1]}" + break + + return organism + + def grandeur_summary(df, args): """ @@ -248,8 +246,8 @@ def grandeur_summary(df, args): df['coverage'] = df['coverage'].fillna(0) df['coverage'] = df['coverage'].round(2) - if 'serotypefinder_Serotype_O' in summary_df.columns: - df = escherichia_serotype(df, summary_df, args) + if 'shigatyper_hit' in summary_df.columns and 'serotypefinder_Serotype_O' in summary_df.columns and 'serotypefinder_Serotype_H' in summary_df.columns: + df = serotypefinder_results(df, summary_df) if 'kraken2_organism_(per_fragment)' in summary_df.columns: df = kraken2_results(df, summary_df) @@ -298,7 +296,7 @@ def mash_results(df, args): if not mash_df.empty: mash_df = mash_df.sort_values(by=['P-value', 'mash-distance']) mash_df = mash_df.drop_duplicates(subset=['sample'], keep = 'first') - + df = pd.merge(df,mash_df[['sample','organism']],left_on='Sample_Name', right_on='sample', how='left') df = df.drop('sample', axis=1) @@ -335,7 +333,7 @@ def mlst_results(df, args): return df -def pass_fail(df): +def pass_fail(df, args): """ Uses coverage to set some basic pass/fail conditions. @@ -382,6 +380,14 @@ def pass_fail(df): df = df.sort_values('wgs_id') df['organism'] = df['organism'].str.replace('_',' ',regex=False) + + # fix shigella/ecoli mixups + if 'ecoli_O_H' in df.columns and 'shigatyper_hit' in df.columns and 'mash_organism' in df.columns: + df['organism'] = df.apply(fix_escherichia, axis=1, directory = args.grandeur) + + df['SerotypeFinder (E. coli)'] = None + df['SerotypeFinder (E. coli)'] = df.apply(lambda row: f"{row['organism']} {row['ecoli_O_H']}" if pd.notna(row['ecoli_O_H']) else row['SerotypeFinder (E. coli)'], axis=1) + return df @@ -402,7 +408,7 @@ def results_to_df(path, delim, end): if not os.path.isdir(path): return pd.DataFrame() - + logging.info(f"Getting information from {path}") dfs = [] @@ -416,7 +422,7 @@ def results_to_df(path, delim, end): if dfs: return pd.concat(dfs, ignore_index=True) else: - return pd.DataFrame() + return pd.DataFrame() def sample_sheet_to_df(samplesheet): @@ -441,10 +447,10 @@ def sample_sheet_to_df(samplesheet): df['wgs_id'] = df['Sample_Name'].str.replace('-UT.*','', regex=True) return df - + else: logging.fatal('Sample sheet could not be located! (Specify with -s)') - exit(1) + sys.exit(1) def seqsero2_results(df, args): @@ -462,10 +468,10 @@ def seqsero2_results(df, args): """ seqsero2_dir = f"{args.grandeur}/seqsero2/" - + if not os.path.isdir(seqsero2_dir): return df - + logging.info(f"Getting salmonella serotype information from {seqsero2_dir}") # does not use results_to_df because of file structure @@ -474,7 +480,7 @@ def seqsero2_results(df, args): for file in files: ind_df = pd.read_table(file, sep='\t') if not ind_df.empty: - dfs.append(ind_df) + dfs.append(ind_df) if dfs: seqsero2_df = pd.concat(dfs, ignore_index=True) @@ -492,6 +498,35 @@ def seqsero2_results(df, args): return df +def serotypefinder_results(df, summary_df): + """ + + Parses serotypefinder output + + Args: + df (pd.Dataframe): dataframe for results thus far. + summary_df (pd.Dataframe): dataframe of grandeur results. + args (argparse.Namespace): Parsed command-line arguments. + + Returns: + df (pd.Dataframe): Pandas dataframe of the parsed output. + + """ + logging.info('Double checking Escherichia organism with shigatyper results') + + if 'shigatyper_hit' in summary_df.columns and 'serotypefinder_Serotype_O' in summary_df.columns and 'serotypefinder_Serotype_H' in summary_df.columns: + # creating a copy of the summary_df to just the Escherichia samples + ecoli_df = summary_df[summary_df['mash_organism'].str.contains('Shigella', na=False) | summary_df['mash_organism'].str.contains('Escherichia', na=False) ].copy() + ecoli_df['serotypefinder_Serotype_O'] = ecoli_df['serotypefinder_Serotype_O'].fillna("none") + ecoli_df['serotypefinder_Serotype_H'] = ecoli_df['serotypefinder_Serotype_H'].fillna("none") + ecoli_df['ecoli_O_H'] = ecoli_df['serotypefinder_Serotype_O'].astype(str) + ':' + ecoli_df['serotypefinder_Serotype_H'].astype(str) + + df = pd.merge(df, ecoli_df[['sample','ecoli_O_H', 'shigatyper_hit', 'mash_organism']],left_on='Sample_Name', right_on='sample', how='left') + df = df.drop('sample', axis=1) + + return df + + def main(): """ @@ -505,7 +540,7 @@ def main(): """ - logging.basicConfig(format='%(asctime)s - %(message)s', datefmt = '%y-%b-%d %H:%M:%S', level=logging.INFO) + logging.basicConfig(format='%(asctime)s - %(message)s', datefmt = '%y-%b-%d %H:%M:%S', level=logging.INFO) version = '0.1.24191' @@ -517,24 +552,24 @@ def main(): df = sample_sheet_to_df(args.samplesheet) - df = grandeur_summary( df, args) + df = grandeur_summary( df, args) - df = fastani_results( df, args) + df = fastani_results( df, args) - df = mash_results( df, args) + df = mash_results( df, args) - df = seqsero2_results( df, args) + df = seqsero2_results( df, args) - df = mlst_results( df, args) + df = mlst_results( df, args) - df = emmtyper_results( df, args) + df = emmtyper_results( df, args) - df = amrfinder_results( df, args) + df = amrfinder_results( df, args) - df = pass_fail( df) + df = pass_fail(df, args) - create_files( df) + create_files(df) if __name__ == "__main__": - main() \ No newline at end of file + main()