Skip to content

Commit

Permalink
Merge pull request #40 from stjude/fix
Browse files Browse the repository at this point in the history
Fix
  • Loading branch information
rawagiha committed May 2, 2023
2 parents 8439515 + 37ca13a commit 1ff9c57
Show file tree
Hide file tree
Showing 10 changed files with 636 additions and 15 deletions.
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
python>=3.6
python>=3.8
ssw-py>=1.0.1
pandas>=0.23.0
scikit-learn>=0.22.0
indelpost>=0.0.4
12 changes: 9 additions & 3 deletions rnaindel/analysis/alignment_feature_calculator.py
Original file line number Diff line number Diff line change
Expand Up @@ -236,8 +236,14 @@ def read_support_features(valn, downsample_threshold=1500):
else:
ref_cnt, alt_cnt = orig_ref_cnt, orig_alt_cnt

ref_fw_rv = valn.count_alleles(fwrv=True, estimated_count=False)[0]
alt_fw_rv = valn.count_alleles(fwrv=True, estimated_count=False)[1]
is_bidirectional = all(alt_fw_rv)
tot_fw = ref_fw_rv[0] + alt_fw_rv[0]
tot_rv = ref_fw_rv[1] + ref_fw_rv[1]

is_bidirectional = True
if min(tot_fw, tot_rv) >= 3:
is_bidirectional = all(alt_fw_rv)

return (
int(ref_cnt),
Expand Down Expand Up @@ -410,7 +416,7 @@ def mapping_features(target_indel, valn, bam, mapq):
# near exon
is_near_exon_boundaray = (
1
if most_common([is_near_boundary(read, target_indel) for read in target_reads])
if sum([is_near_boundary(read, target_indel) for read in target_reads])
else 0
)

Expand Down Expand Up @@ -527,7 +533,7 @@ def is_near_boundary(aligned_segment, target):
if not isinstance(idx, int):
return False

threshold = 2 if len(target.indel_seq) <= 2 else 3
threshold = 2 if len(target.indel_seq) <= 3 else 3

is_near = 0
if idx >= 2 and "N" in cigar_lst[idx - 2]:
Expand Down
4 changes: 3 additions & 1 deletion rnaindel/analysis/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from functools import partial

from rnaindel.defaultcaller.defaultcaller import callindel
from rnaindel.defaultcaller.softclip_realigner import realn_softclips

from .preprocessor import preprocess
from .classifier import classify
Expand Down Expand Up @@ -36,7 +37,8 @@ def analyze(subcommand, version=None):

with tempfile.TemporaryDirectory() as tmp_dir:
callindel(bam, fasta, tmp_dir, args.heap_memory, region, n_processes)

realn_softclips(bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode)

df = preprocess(
tmp_dir,
fasta,
Expand Down
21 changes: 18 additions & 3 deletions rnaindel/analysis/callset_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,14 @@


def format_callset(tmp_dir, external_vcf, pass_only, region):

allchroms = get_outfile(tmp_dir)
if allchroms:
append_external_indels(
allchroms, external_vcf, pass_only, which_chrom=0, region=region
)

append_softclips(allchroms)

return allchroms

by_chroms = get_chrom_files(tmp_dir)
Expand All @@ -20,6 +22,9 @@ def format_callset(tmp_dir, external_vcf, pass_only, region):
append_external_indels(
each_chrom[0], external_vcf, pass_only, which_chrom=each_chrom[1]
)

append_softclips(each_chrom[0])

available_chroms.append(each_chrom[0])

return available_chroms
Expand All @@ -40,8 +45,19 @@ def get_chrom_files(tmp_dir):
)


def append_external_indels(filepath, external_vcf, pass_only, which_chrom, region=None):
def append_softclips(filepath):
sftclip_filepath = filepath[: len(filepath) - 3] + "sftclp.txt"

if os.path.isfile(sftclip_filepath):
df1 = pd.read_csv(filepath, sep="\t")
df2 = pd.read_csv(sftclip_filepath, sep="\t")

df = pd.concat([df1, df2], ignore_index=True, sort=False)

df.to_csv(filepath, sep="\t", index=False)


def append_external_indels(filepath, external_vcf, pass_only, which_chrom, region=None):
if os.stat(filepath).st_size == 0:
df1 = supply_empty_df()
else:
Expand Down Expand Up @@ -101,7 +117,6 @@ def is_chr_prefixed_vcf(external_vcf):


def update_data(data_lst, record, pass_only, max_indel_len=50):

if pass_only:
try:
if record.filter.__getitem__("PASS").record:
Expand Down
28 changes: 23 additions & 5 deletions rnaindel/analysis/postprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,15 @@ def filter_str(row, non_somatic_db, pon, mapping_thresh):
pred = row["predicted_class"]

if pred == "somatic":
s = [
msg = [
filter_by_db(row, non_somatic_db, pon),
filter_by_mappability(row, mapping_thresh),
filter_by_bidirectionality(row),
filter_by_junction(row),
]
if any(s):
return ";".join(s).strip(";")
if any(msg):
msg = [_f for _f in msg if _f]
return ";".join(msg).strip(";")
else:
return "PASS"
else:
Expand Down Expand Up @@ -110,6 +113,20 @@ def filter_by_mappability(row, mapping_thresh):
return ""


def filter_by_bidirectionality(row):
if row["is_bidirectional"]:
return ""
else:
return "UnidirectionalSupport"


def filter_by_junction(row):
if row["is_near_boundary"]:
return "ImmediateProximityToJunciton"
else:
return ""


def reclassify_by_knowledge(row, cosmic):
if row["is_common"]:
return False
Expand All @@ -127,8 +144,9 @@ def reclassify_by_knowledge(row, cosmic):

def sort_positionally(df):
df["_chrom"] = df.apply(lambda x: x["chrom"].replace("chr", ""), axis=1)
df["_chrom"] = df.apply(lambda x: 23 if x["chrom"] == "X" else x["_chrom"], axis=1)
df["_chrom"] = df.apply(lambda x: 24 if x["chrom"] == "Y" else x["_chrom"], axis=1)
df["_chrom"] = df.apply(lambda x: 23 if x["_chrom"] == "X" else x["_chrom"], axis=1)
df["_chrom"] = df.apply(lambda x: 24 if x["_chrom"] == "Y" else x["_chrom"], axis=1)

df["_chrom"] = df.apply(lambda x: int(x["_chrom"]), axis=1)

df.sort_values(["_chrom", "cpos"], inplace=True)
Expand Down
2 changes: 2 additions & 0 deletions rnaindel/analysis/vcf_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,8 @@ def filter_fields():
'##FILTER=<ID=ProbableArtifactByPON,Description="Matches to user-defined non-somatic indel with the predicted artifact probability higher than the germline probability.">',
'##FILTER=<ID=ProbableGermlineByPON,Description="Matches to user-defined non-somatic indel with the predicted germline probability higher than the artifact probability">',
'##FILTER=<ID=LowMappabilityRegion,Description="More than half of reads covering the indel locus having MAPQ < unique mapping quality score.">',
'##FILTER=<ID=UnidirectionalSupport,Description="ALT allele is only supported by one read direction.">',
'##FILTER=<ID=ImmediateProximityToJunction,Description="Indel within exon, but near splice-junciton boundary. Possible mapping artifact">',
]
return h

Expand Down
2 changes: 1 addition & 1 deletion rnaindel/defaultcaller/defaultcaller.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def callindel(bam, fasta, tmp_dir, heap_memory, region, num_of_processes):
base_cmd_str = (
"java -Xmx{} Ace2.SAMStreamingSNPFinder -fasta {} -min-mapq 1 "
"-optional-tags XT!=R -bam {} -tn T -min-quality 20 "
"-min-flanking-quality 20 -min-alt-allele-count 3 "
"-min-flanking-quality 20 -min-alt-allele-count 2 "
"-min-minor-frequency 0 -broad-min-quality 10 "
"-mmf-max-hq-mismatches 8 -mmf-max-hq-mismatches-xt-u 10 "
"-mmf-min-hq-quality 15 -mmf-max-lq-mismatches 8 "
Expand Down
Loading

0 comments on commit 1ff9c57

Please sign in to comment.