From 5df4a7a64d59f2382bf63df3dcdba2799f4f6eae Mon Sep 17 00:00:00 2001
From: Jack Huey <31162821+jackh726@users.noreply.github.com>
Date: Fri, 18 Aug 2023 21:50:20 -0400
Subject: [PATCH] Convert bigbedtobed to derive parser and implement extra options

---
 src/bin/bigbedtobed.rs | 185 ++++++++++++++++++++++++++++++++++-------
 1 file changed, 157 insertions(+), 28 deletions(-)

Notes (free-form commentary between the diffstat and `diff --git`; not part of the diff itself).

What this patch does:
  * Replaces the builder-style clap `Command` in `main` with a `#[derive(Parser)] struct Cli`
    (positional `big_bed` and `bed`, `--chrom`, `--start`, `--end`, `overlap_bed`, `-t/--nthreads` defaulting to 6).
  * Adds `chrom`, `start` and `end` parameters to `write_bed`: chromosomes are filtered by name when `chrom`
    is set, and `start`/`end` are passed on to the per-chromosome `file_future`, which skips records by `val.start`.
  * Adds `write_bed_from_bed`: for every line of the given bed file it takes the first three tab-separated
    fields (chrom, start, end) and writes the records returned by `bigbed.get_interval(chrom, start, end)`
    through a single `BufWriter`.
  * Rewrites single-dash options (`-chrom=`, `-start=`, `-end=`, `-bed=`) to `--chrom=`, `--start=`, `--end=`,
    `--overlap-bed=` before handing the arguments to clap; `-header` and anything starting with `-udcDir` panic
    with "Unimplemented compatibility option".
  * Adds the `verify_cli_bigbedtobed` test, which runs `Cli::command().debug_assert()`.

Amended in review:
  * The "--start/--end without --chrom" guard was written as `a || b & c`. `&` binds tighter than `||`, so it
    evaluated as `a || (b & c)` and rejected every `--start`, even when `--chrom` was supplied. It now reads
    `(a || b) && c`. The change stays inside one added line, so the hunk line counts above remain valid.

NOTE(review) - observed but deliberately not changed here; please confirm:
  * `overlap_bed` carries no `#[arg(long)]` (unlike `chrom`/`start`/`end`), yet `-bed=` is rewritten to
    `--overlap-bed=` and the error text names `--overlap-bed`. With clap's derive API a field without
    `short`/`long` is a positional argument, so `--overlap-bed=...` would presumably be rejected.
  * The `start` filter skips records with `val.start <= start` while the help text says "greater than or equal
    to it"; the `end` filter skips only `val.start > end` while the help text says "less than it".
  * The validation failures print with `eprintln!` and then `return Ok(())`, so the process exits with status 0.
  * `write_bed_from_bed` uses `expect`/`unwrap` on the bed columns, so a short or non-numeric line panics
    instead of returning an error; the `BufWriter` is never flushed explicitly.
  * `b.replace("-chrom=", "")` (and siblings) removes every occurrence of the text, not only the leading
    prefix; `strip_prefix` would express the intent.
  * Help text typo: "the bigbed to get convert to bed".
  * This copy of the patch has its line breaks collapsed and its angle-bracketed generic arguments missing
    (e.g. `Option,`, `parse::()`, `Box>`); it will not apply or compile until restored from the original commit.

diff --git a/src/bin/bigbedtobed.rs b/src/bin/bigbedtobed.rs index faf1318..6c83bad 100644 --- a/src/bin/bigbedtobed.rs +++ b/src/bin/bigbedtobed.rs @@ -1,8 +1,13 @@ +use std::env; use std::error::Error; +use std::ffi::OsString; use std::fs::File; -use std::io::{self, Write}; +use std::io::{self, BufReader, Write}; +use std::path::Path; +use std::str::FromStr; -use clap::{Arg, Command}; +use bigtools::utils::streaming_linereader::StreamingLineReader; +use clap::Parser; use futures::task::SpawnExt; @@ -15,6 +20,9 @@ pub fn write_bed( bigbed: BigBedRead, mut out_file: File, nthreads: usize, + chrom: Option, + start: Option, + end: Option, ) -> Result<(), BBIReadError> { let pool = futures::executor::ThreadPoolBuilder::new() .pool_size(nthreads) @@ -24,6 +32,7 @@ pub fn write_bed( let chrom_files: Vec)>> = bigbed .get_chroms() .into_iter() + .filter(|c| chrom.as_ref().map_or(true, |chrom| &c.name == chrom)) .map(|chrom| { let bigbed = bigbed.reopen()?; let (buf, file): (TempFileBuffer, TempFileBufferWriter) = @@ -33,9 +42,21 @@ pub fn write_bed( mut bigbed: BigBedRead, chrom: ChromAndSize, mut writer: io::BufWriter>, + start: Option, + end: Option, ) -> Result<(), BBIReadError> { for raw_val in bigbed.get_interval(&chrom.name, 0, chrom.length)? 
{ let val = raw_val?; + if let Some(start) = start { + if val.start <= start { + continue; + } + } + if let Some(end) = end { + if val.start > end { + continue; + } + } let end = if !val.rest.is_empty() { format!("\t{}\n", val.rest) } else { @@ -49,7 +70,7 @@ pub fn write_bed( Ok(()) } let handle = pool - .spawn_with_handle(file_future(bigbed, chrom, writer)) + .spawn_with_handle(file_future(bigbed, chrom, writer, start, end)) .expect("Couldn't spawn."); Ok((handle, buf)) }) @@ -65,36 +86,144 @@ pub fn write_bed( Ok(()) } +pub fn write_bed_from_bed( + mut bigbed: BigBedRead, + out_file: File, + bed: File, +) -> Result<(), BBIReadError> { + let mut bedstream = StreamingLineReader::new(BufReader::new(bed)); + let mut writer = io::BufWriter::new(out_file); + + while let Some(line) = bedstream.read() { + let line = line?; + let mut split = line.trim().splitn(5, '\t'); + let chrom = split.next().expect("Missing chrom"); + let start = split.next().expect("Missing start").parse::().unwrap(); + let end = split.next().expect("Missing end").parse::().unwrap(); + for raw_val in bigbed.get_interval(chrom, start, end)? { + let val = raw_val?; + let end = if !val.rest.is_empty() { + format!("\t{}\n", val.rest) + } else { + "\n".to_string() + }; + writer.write_fmt(format_args!("{}\t{}\t{}{}", chrom, val.start, val.end, end))?; + } + } + + Ok(()) +} + +#[derive(Parser)] +#[command(about = "Converts an input bigBed to a bed. Can be multi-threaded for substantial speedups. 
Note for roughly each core, one temporary file will be opened.", long_about = None)] +struct Cli { + /// the bigbed to get convert to bed + big_bed: String, + + /// the path of the bed to output to + bed: String, + + /// If set, restrict output to given chromosome + #[arg(long)] + chrom: Option, + + /// If set, restrict output to regions greater than or equal to it + #[arg(long)] + start: Option, + + /// If set, restrict output to regions less than it + #[arg(long)] + end: Option, + + /// If set, restrict output to regions overlapping the bed file + overlap_bed: Option, + + /// Set the number of threads to use. This tool will nearly always benefit from more cores (<= # chroms). Note: for parts of the runtime, the actual usage may be nthreads+1 + #[arg(short = 't', long)] + #[arg(default_value_t = 6)] + pub nthreads: usize, +} + fn main() -> Result<(), Box> { - let matches = Command::new("BigBedToBedGraph") - .about("Converts an input bigBed to a bed. Can be multi-threaded for substantial speedups. Note for roughly each core, one temporary file will be opened.") - .arg(Arg::new("bigbed") - .help("the bigbed to get convert to bed") - .index(1) - .required(true) - ) - .arg(Arg::new("bed") - .help("the path of the bed to output to") - .index(2) - .required(true) - ) - .arg(Arg::new("nthreads") - .short('t') - .help("Set the number of threads to use. This tool will nearly always benefit from more cores (<= # chroms). 
Note: for parts of the runtime, the actual usage may be nthreads+1") - .num_args(1) - .default_value("6") - .value_parser(clap::value_parser!(usize))) - .get_matches(); - - let bigbedpath = matches.get_one::("bigbed").unwrap().to_owned(); - let bedpath = matches.get_one::("bed").unwrap().to_owned(); - - let nthreads = *matches.get_one::("nthreads").unwrap(); + let args = env::args_os().map(|a| { + match a.to_str() { + Some(b) if b.starts_with("-chrom=") => { + return OsString::from_str(&format!("--chrom={}", b.replace("-chrom=", ""))) + .unwrap() + } + Some(b) if b.starts_with("-start=") => { + return OsString::from_str(&format!("--start={}", b.replace("-start=", ""))) + .unwrap() + } + Some(b) if b.starts_with("-end=") => { + return OsString::from_str(&format!("--end={}", b.replace("-end=", ""))).unwrap() + } + Some(b) if b.starts_with("-bed=") => { + return OsString::from_str(&format!("--overlap-bed={}", b.replace("-bed=", ""))) + .unwrap() + } + Some("-header") => { + panic!( + "Unimplemented compatibility option {}.", + a.to_string_lossy() + ); + } + Some(b) if b.starts_with("-udcDir") => { + panic!( + "Unimplemented compatibility option {}.", + a.to_string_lossy() + ); + } + _ => {} + } + a + }); + let matches = Cli::parse_from(args); + + let bigbedpath = matches.big_bed; + let bedpath = matches.bed; + + let nthreads = matches.nthreads; let bigbed = BigBedRead::open_file(&bigbedpath)?; let bed = File::create(bedpath)?; - write_bed(bigbed, bed, nthreads)?; + if (matches.start.is_some() || matches.end.is_some()) && matches.chrom.is_none() { + eprintln!("Cannot specify --start or --end without specifying --chrom."); + return Ok(()); + } + + if matches.chrom.is_some() && matches.overlap_bed.is_some() { + eprintln!("Cannot specify both --overlap-bed and interval to overlap."); + return Ok(()); + } + + match matches.overlap_bed { + Some(overlap_bed) => { + if !Path::exists(&Path::new(&overlap_bed)) { + eprintln!("Overlap bed file does not exist."); + return Ok(()); + } 
+ let overlap_bed = File::open(overlap_bed)?; + write_bed_from_bed(bigbed, bed, overlap_bed)?; + } + None => { + write_bed( + bigbed, + bed, + nthreads, + matches.chrom, + matches.start, + matches.end, + )?; + } + } Ok(()) } + +#[test] +fn verify_cli_bigbedtobed() { + use clap::CommandFactory; + Cli::command().debug_assert() +}