Find intervals that overlap a certain percentage and keep the longest one

Based on the sketch provided here: Here is a script that I used to evaluate overlaps two levels down, first looking for the longest interval in a merged region, and then looking at overlaps with intervals overlapping that longest interval.

This runs entirely within Python, no use of bedops or other command-line kits. This uses the ncls library, which has some Cython optimizations for fast interval overlap queries, along with PyRanges to do the first-pass merge operation.

Notes:

1. This is not recursive; some more complicated overlap arrangements might need a recursive approach, depending on how your overlap criteria must be applied.

2. This would need to be run on one chromosome's worth of data at a time, unless coordinates are first translated to an "absolute" coordinate scheme as described in a previous comment. The bedextract tool can be used to quickly split a BED file into separate chromosomes, e.g.,

for chr in `bedextract --list-chr in.bed`; do bedextract \${chr} in.bed > in.\${chr}.bed; done

Code:

#!/usr/bin/env python

import sys
import io
import click
import ncls
import pandas as pd
import pyranges as pr
import numpy as np
import collections

test_intervals_str=""'Chromosome  Start   End Id
chr1    50  80  F
chr1    100 120 D
chr1    100 140 C
chr1    120 200 A
chr1    199 260 B
'''

def df_from_intervals_str(str):
'''
In:
str - (string) TSV-formatted intervals with header

Out:
df - (Pandas dataframe)
'''
intervals = io.StringIO(str)

def ncl_from_df(df):
'''
In:
df - (Pandas dataframe)

Out:
df_as_ncl - (NCLS object)
'''
return ncls.NCLS(df['Start'], df['End'], df.index)

def ncl_all_overlaps(a, b):
'''
In:
a - (NCLS object) set A
b - (Pandas dataframe) set B

Out:
(l_idxs, r_idxs) - (tuple) indices of set A and set B which overlap
'''
return a.all_overlaps_both(b['Start'].values, b['End'].values, b.index.values)

def test_ncl_all_overlaps(a, b):
'''
In:
a - (NCLS object) set A
b - (Pandas dataframe) set B
'''
print(ncl_all_overlaps(a, b))

def ovr_idxs_to_map(ovr):
'''
In:
ovr - (tuple) output from ncl_all_overlaps

Out:
m - (OrderedDict) mapping of overlaps by indices
'''
m = collections.OrderedDict()
for l_idx, r_idx in zip(ovr, ovr):
if l_idx not in m:
m[l_idx] = []
m[l_idx].append(r_idx)
return m

def search_candidates(m, a, b, t):
'''
In:
m - (OrderedDict) mapping of overlaps by indices
a - (Pandas dataframe) set A (merged regions)
b - (Pandas dataframe) set B (input intervals)
t - (float) threshold for overlap

Accepted elements are written to standard output stream
'''
for ak, bv in m.items():
if len(bv) == 1:
# print disjoint element
b_row = b.iloc[[bv]]
sys.stdout.write('{}n'.format('t'.join([str(x) for x in b_row.values.flatten().tolist()])))
else:
# get set A's merged region length
a_row = a.iloc[[ak]]
a_len = a_row['End'].item() - a_row['Start'].item()
# iterate through set B to get the first, longest overlap
mo = 0
mi = -1
mv = -1
for i, v in enumerate(bv):
b_row = b.iloc[[v]]
co = (b_row['End'].item() - b_row['Start'].item()) / a_len
if co > mo:
mo = co
mi = i
mv = v
# if the longest element does not meet the overlap
if mo <= t:
continue
# otherwise, examine a list of candidates
candidate_idxs = bv.copy()
accepted_idxs = [mv]
candidate_idxs.pop(mi)
# mv is the index of the item in set B that we now test against the
# remaining elements in the candidate list
pv = mv
parent_df = b.iloc[[pv]]
children_df_as_ncl = ncl_from_df(b.iloc[candidate_idxs, :])
children_ovr_parent = ncl_all_overlaps(children_df_as_ncl, parent_df)
children_ovr_parent_map = ovr_idxs_to_map(children_ovr_parent)
# test overlaps of children with longest-overlapping parent
p_row = b.iloc[[pv]]
candidate_idxs_to_remove = []
for ci, cv in enumerate(candidate_idxs):
c_row = b.iloc[[cv]]
c_len = c_row['End'].item() - c_row['Start'].item()
# remove any candidates which do not overlap the parent -- these may
# have originally overlapped the merged regions we started with
if cv not in children_ovr_parent_map[pv]:
candidate_idxs_to_remove.append(ci)
continue
# measure overlap, per criteria in sketch
if (c_row['Start'].item() < p_row['Start'].item()) and (c_row['End'].item() < p_row['End'].item()):
l = p_row['Start'].item()
r = c_row['End'].item()
elif (c_row['Start'].item() < p_row['End'].item()) and (c_row['End'].item() > p_row['End'].item()):
l = c_row['Start'].item()
r = p_row['End'].item()
else:
# child element is nested within the parent
candidate_idxs_to_remove.append(ci)
continue
# calculate overlap, relative to child element
o = (r - l) / c_len
# if child element coverage is *less* than threshold, include it
if o < t:
accepted_idxs.append(cv)
# either way, we remove it from further consideration
candidate_idxs_to_remove.append(ci)
# make sure that we have no children left to test
# if we have any candidate children left, something went wrong
assert(len(candidate_idxs) == len(candidate_idxs_to_remove))
# print accepted elements
for acc_idx in accepted_idxs:
acc_row = b.iloc[[acc_idx]]
sys.stdout.write('{}n'.format('t'.join([str(x) for x in acc_row.values.flatten().tolist()])))

@click.command()
@click.option('--threshold', type=float, default=0.1, help='overlap threshold')
def main(threshold):
# validate parameter
assert((threshold > 0) and (threshold < 1))
# import data
df = df_from_intervals_str(test_intervals_str)
# validate input -- only one chromosome at a time
assert(len(df.Chromosome.unique()) == 1)
df_as_ncl = ncl_from_df(df)
gr = pr.from_string(test_intervals_str)
mf = gr.merge().as_df()
# convert column types to make compatible with ncls
mf['Start'] = mf['Start'].astype(np.int64)
mf['End'] = mf['End'].astype(np.int64)
# associate intervals with merged regions by indices (analogous to bedmap)
mf_ovr_df = ncl_all_overlaps(df_as_ncl, mf)
mf_ovr_df_map = ovr_idxs_to_map(mf_ovr_df)
# search through associations for those that meet overlap criteria
search_candidates(mf_ovr_df_map, mf, df, threshold)

if __name__ == '__main__':
main()

Some examples of running it with different thresholds:

\$ ./biostars9470750.py --threshold=0.1
chr1    50  80  F
chr1    120 200 A
chr1    199 260 B
\$ ./biostars9470750.py --threshold=0.5
chr1    50  80  F
\$ ./biostars9470750.py --threshold=0.01
chr1    50  80  F
chr1    120 200 A 