Skip to content

Commit

Permalink
Convert bigbedtobed to derive parser and implement extra options
Browse files Browse the repository at this point in the history
  • Loading branch information
jackh726 committed Aug 19, 2023
1 parent 97b917c commit 5df4a7a
Showing 1 changed file with 157 additions and 28 deletions.
185 changes: 157 additions & 28 deletions src/bin/bigbedtobed.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,13 @@
use std::env;
use std::error::Error;
use std::ffi::OsString;
use std::fs::File;
use std::io::{self, Write};
use std::io::{self, BufReader, Write};
use std::path::Path;
use std::str::FromStr;

use clap::{Arg, Command};
use bigtools::utils::streaming_linereader::StreamingLineReader;
use clap::Parser;

use futures::task::SpawnExt;

Expand All @@ -15,6 +20,9 @@ pub fn write_bed<R: Reopen + SeekableRead + Send + 'static>(
bigbed: BigBedRead<R>,
mut out_file: File,
nthreads: usize,
chrom: Option<String>,
start: Option<u32>,
end: Option<u32>,
) -> Result<(), BBIReadError> {
let pool = futures::executor::ThreadPoolBuilder::new()
.pool_size(nthreads)
Expand All @@ -24,6 +32,7 @@ pub fn write_bed<R: Reopen + SeekableRead + Send + 'static>(
let chrom_files: Vec<io::Result<(_, TempFileBuffer<File>)>> = bigbed
.get_chroms()
.into_iter()
.filter(|c| chrom.as_ref().map_or(true, |chrom| &c.name == chrom))
.map(|chrom| {
let bigbed = bigbed.reopen()?;
let (buf, file): (TempFileBuffer<File>, TempFileBufferWriter<File>) =
Expand All @@ -33,9 +42,21 @@ pub fn write_bed<R: Reopen + SeekableRead + Send + 'static>(
mut bigbed: BigBedRead<R>,
chrom: ChromAndSize,
mut writer: io::BufWriter<TempFileBufferWriter<File>>,
start: Option<u32>,
end: Option<u32>,
) -> Result<(), BBIReadError> {
for raw_val in bigbed.get_interval(&chrom.name, 0, chrom.length)? {
let val = raw_val?;
if let Some(start) = start {
if val.start <= start {
continue;
}
}
if let Some(end) = end {
if val.start > end {
continue;
}
}
let end = if !val.rest.is_empty() {
format!("\t{}\n", val.rest)
} else {
Expand All @@ -49,7 +70,7 @@ pub fn write_bed<R: Reopen + SeekableRead + Send + 'static>(
Ok(())
}
let handle = pool
.spawn_with_handle(file_future(bigbed, chrom, writer))
.spawn_with_handle(file_future(bigbed, chrom, writer, start, end))
.expect("Couldn't spawn.");
Ok((handle, buf))
})
Expand All @@ -65,36 +86,144 @@ pub fn write_bed<R: Reopen + SeekableRead + Send + 'static>(
Ok(())
}

/// Parses one coordinate column (`start` or `end`) of an overlap bed line.
///
/// Returns an `InvalidData` I/O error, naming the column and quoting the line, when the
/// column is absent or is not a valid `u32`.
fn parse_bed_coord(field: Option<&str>, name: &str, line: &str) -> io::Result<u32> {
    let raw = field.ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            format!("Missing {} in bed line {:?}", name, line),
        )
    })?;
    raw.parse::<u32>().map_err(|err| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            format!("Invalid {} {:?} in bed line {:?}: {}", name, raw, line, err),
        )
    })
}

/// Writes the bigBed entries that overlap the regions listed in `bed` to `out_file`.
///
/// Every non-blank line of `bed` is split on tabs; the first three columns are taken as
/// `chrom`, `start` and `end`, and any further columns are ignored. For each such region
/// the intervals returned by `bigbed.get_interval(chrom, start, end)` are written as
/// `chrom\tstart\tend[\trest]\n`, in the order the regions appear in `bed`.
///
/// # Errors
///
/// Returns an error when reading `bed` or writing `out_file` fails, when the bigBed query
/// fails, or when a line of `bed` has a missing or non-numeric `start`/`end` column
/// (reported as an `InvalidData` I/O error instead of a panic).
pub fn write_bed_from_bed<R: Reopen + SeekableRead + Send + 'static>(
    mut bigbed: BigBedRead<R>,
    out_file: File,
    bed: File,
) -> Result<(), BBIReadError> {
    let mut bedstream = StreamingLineReader::new(BufReader::new(bed));
    let mut writer = io::BufWriter::new(out_file);

    while let Some(line) = bedstream.read() {
        let line = line?.trim();
        // A blank line (commonly the trailing one) carries no region; skip it rather
        // than reporting a missing column.
        if line.is_empty() {
            continue;
        }

        let mut columns = line.split('\t');
        // `split` always yields at least one item, so this is the first column.
        let chrom = columns.next().unwrap_or(line);
        let start = parse_bed_coord(columns.next(), "start", line)?;
        let end = parse_bed_coord(columns.next(), "end", line)?;

        for raw_val in bigbed.get_interval(chrom, start, end)? {
            let val = raw_val?;
            write!(writer, "{}\t{}\t{}", chrom, val.start, val.end)?;
            if !val.rest.is_empty() {
                write!(writer, "\t{}", val.rest)?;
            }
            writer.write_all(b"\n")?;
        }
    }

    // Dropping a `BufWriter` discards flush errors; flush explicitly so they surface.
    writer.flush()?;

    Ok(())
}

/// Command-line arguments of the `bigbedtobed` binary.
///
/// The doc comment on each field doubles as that argument's `--help` text.
#[derive(Parser)]
#[command(about = "Converts an input bigBed to a bed. Can be multi-threaded for substantial speedups. Note for roughly each core, one temporary file will be opened.", long_about = None)]
struct Cli {
    /// the bigbed to convert to bed
    big_bed: String,

    /// the path of the bed to output to
    bed: String,

    /// If set, restrict output to given chromosome
    #[arg(long)]
    chrom: Option<String>,

    /// If set, restrict output to regions greater than or equal to it
    #[arg(long)]
    start: Option<u32>,

    /// If set, restrict output to regions less than it
    #[arg(long)]
    end: Option<u32>,

    /// If set, restrict output to regions overlapping the bed file
    // Must be a named option: `main` rewrites the legacy `-bed=FILE` spelling to
    // `--overlap-bed=FILE`, which a positional argument would not accept.
    #[arg(long)]
    overlap_bed: Option<String>,

    /// Set the number of threads to use. This tool will nearly always benefit from more cores (<= # chroms). Note: for parts of the runtime, the actual usage may be nthreads+1
    #[arg(short = 't', long)]
    #[arg(default_value_t = 6)]
    pub nthreads: usize,
}

/// Entry point of the `bigbedtobed` binary.
///
/// Single-dash compatibility options (`-chrom=`, `-start=`, `-end=`, `-bed=`) are first
/// rewritten to the double-dash options understood by [`Cli`]; `-header` and `-udcDir…`
/// are not implemented and abort with a panic. The arguments are then validated, and the
/// bigBed is written to the output bed either restricted to the regions of an overlap bed
/// file (`write_bed_from_bed`) or, optionally restricted to a chromosome/range, in full
/// (`write_bed`).
///
/// Invalid argument combinations are reported on stderr and the function returns `Ok(())`
/// without creating the output file.
fn main() -> Result<(), Box<dyn Error>> {
    // Legacy single-dash spellings and the option each one maps to.
    const RENAMED: [(&str, &str); 4] = [
        ("-chrom=", "--chrom="),
        ("-start=", "--start="),
        ("-end=", "--end="),
        ("-bed=", "--overlap-bed="),
    ];

    let args = env::args_os().map(|a| {
        // Arguments that are not valid UTF-8 are passed through untouched.
        let rewritten = a.to_str().and_then(|arg| {
            if arg == "-header" || arg.starts_with("-udcDir") {
                panic!("Unimplemented compatibility option {}.", arg);
            }
            // `strip_prefix` removes only the leading option name, leaving the value
            // intact even if it happens to contain the same text again.
            RENAMED.iter().find_map(|&(old, new)| {
                arg.strip_prefix(old)
                    .map(|value| format!("{}{}", new, value))
            })
        });
        match rewritten {
            Some(arg) => OsString::from(arg),
            None => a,
        }
    });
    let matches = Cli::parse_from(args);

    // Validate before touching the filesystem so that a bad invocation does not
    // truncate an existing output file.
    // The parentheses matter: `a || b && c` would group as `a || (b && c)` and reject
    // `--start` even when `--chrom` is present.
    if (matches.start.is_some() || matches.end.is_some()) && matches.chrom.is_none() {
        eprintln!("Cannot specify --start or --end without specifying --chrom.");
        return Ok(());
    }

    if matches.chrom.is_some() && matches.overlap_bed.is_some() {
        eprintln!("Cannot specify both --overlap-bed and interval to overlap.");
        return Ok(());
    }

    if let Some(overlap_bed) = &matches.overlap_bed {
        if !Path::new(overlap_bed).exists() {
            eprintln!("Overlap bed file does not exist.");
            return Ok(());
        }
    }

    let bigbedpath = matches.big_bed;
    let bedpath = matches.bed;

    let nthreads = matches.nthreads;

    let bigbed = BigBedRead::open_file(&bigbedpath)?;
    let bed = File::create(bedpath)?;

    match matches.overlap_bed {
        Some(overlap_bed) => {
            let overlap_bed = File::open(overlap_bed)?;
            write_bed_from_bed(bigbed, bed, overlap_bed)?;
        }
        None => {
            write_bed(
                bigbed,
                bed,
                nthreads,
                matches.chrom,
                matches.start,
                matches.end,
            )?;
        }
    }

    Ok(())
}

/// Runs clap's built-in consistency checks over the command derived from `Cli`.
#[test]
fn verify_cli_bigbedtobed() {
    use clap::CommandFactory;

    let command = Cli::command();
    command.debug_assert();
}

0 comments on commit 5df4a7a

Please sign in to comment.