Based on the sketch provided here:

Example

Here is a script that I used to evaluate overlaps two levels down, first looking for the longest interval in a merged region, and then looking at overlaps with intervals overlapping that longest interval.

This runs entirely within Python, no use of bedops or other command-line kits. This uses the ncls library, which has some Cython optimizations for fast interval overlap queries, along with PyRanges to do the first-pass merge operation.

Notes:

  1. This is not recursive; some more complicated overlap arrangements might need a recursive approach, depending on how your overlap criteria must be applied.

  2. This would need to be run on one chromosome's worth of data at a time, unless coordinates are first translated to an "absolute" coordinate scheme as described in a previous comment. The bedextract tool can be used to quickly split a BED file into separate chromosomes, e.g.,

     for chr in `bedextract --list-chr in.bed`; do bedextract ${chr} in.bed > in.${chr}.bed; done
    

Code:

#!/usr/bin/env python

import sys
import io
import click
import ncls
import pandas as pd
import pyranges as pr
import numpy as np
import collections

test_intervals_str=""'Chromosome  Start   End Id
chr1    50  80  F
chr1    100 120 D
chr1    100 140 C
chr1    120 200 A
chr1    199 260 B
'''

def df_from_intervals_str(str):
    '''
    In:
    str - (string) TSV-formatted intervals with header

    Out:
    df - (Pandas dataframe)
    '''
    intervals = io.StringIO(str)
    return pd.read_csv(intervals, sep='t', header=0)

def ncl_from_df(df):
    '''
    In:
    df - (Pandas dataframe)

    Out:
    df_as_ncl - (NCLS object)
    '''
    return ncls.NCLS(df['Start'], df['End'], df.index)

def ncl_all_overlaps(a, b):
    '''
    In:
    a - (NCLS object) set A
    b - (Pandas dataframe) set B

    Out:
    (l_idxs, r_idxs) - (tuple) indices of set A and set B which overlap
    '''
    return a.all_overlaps_both(b['Start'].values, b['End'].values, b.index.values) 

def test_ncl_all_overlaps(a, b):
    '''
    In:
    a - (NCLS object) set A
    b - (Pandas dataframe) set B
    '''
    print(ncl_all_overlaps(a, b))

def ovr_idxs_to_map(ovr):
    '''
    In:
    ovr - (tuple) output from ncl_all_overlaps

    Out:
    m - (OrderedDict) mapping of overlaps by indices
    '''
    m = collections.OrderedDict()
    for l_idx, r_idx in zip(ovr[0], ovr[1]):
        if l_idx not in m:
            m[l_idx] = []
        m[l_idx].append(r_idx)
    return m

def search_candidates(m, a, b, t):
    '''
    In:
    m - (OrderedDict) mapping of overlaps by indices
    a - (Pandas dataframe) set A (merged regions)
    b - (Pandas dataframe) set B (input intervals)
    t - (float) threshold for overlap

    Accepted elements are written to standard output stream
    '''
    for ak, bv in m.items():
        if len(bv) == 1:
            # print disjoint element
            b_row = b.iloc[[bv[0]]]
            sys.stdout.write('{}n'.format('t'.join([str(x) for x in b_row.values.flatten().tolist()])))
        else:
            # get set A's merged region length
            a_row = a.iloc[[ak]]
            a_len = a_row['End'].item() - a_row['Start'].item()
            # iterate through set B to get the first, longest overlap
            mo = 0
            mi = -1
            mv = -1
            for i, v in enumerate(bv):
                b_row = b.iloc[[v]]
                co = (b_row['End'].item() - b_row['Start'].item()) / a_len
                if co > mo:
                    mo = co
                    mi = i
                    mv = v
            # if the longest element does not meet the overlap 
            # threshold, we skip to the next merged region
            if mo <= t:
                continue
            # otherwise, examine a list of candidates
            candidate_idxs = bv.copy()
            accepted_idxs = [mv]
            candidate_idxs.pop(mi)
            # mv is the index of the item in set B that we now test against the 
            # remaining elements in the candidate list
            pv = mv
            parent_df = b.iloc[[pv]]
            children_df_as_ncl = ncl_from_df(b.iloc[candidate_idxs, :])
            children_ovr_parent = ncl_all_overlaps(children_df_as_ncl, parent_df)
            children_ovr_parent_map = ovr_idxs_to_map(children_ovr_parent)
            # test overlaps of children with longest-overlapping parent
            p_row = b.iloc[[pv]]
            candidate_idxs_to_remove = []
            for ci, cv in enumerate(candidate_idxs):
                c_row = b.iloc[[cv]]
                c_len = c_row['End'].item() - c_row['Start'].item()
                # remove any candidates which do not overlap the parent -- these may 
                # have originally overlapped the merged regions we started with
                if cv not in children_ovr_parent_map[pv]:
                    candidate_idxs_to_remove.append(ci)
                    continue
                # measure overlap, per criteria in sketch
                if (c_row['Start'].item() < p_row['Start'].item()) and (c_row['End'].item() < p_row['End'].item()):
                    l = p_row['Start'].item()
                    r = c_row['End'].item()
                elif (c_row['Start'].item() < p_row['End'].item()) and (c_row['End'].item() > p_row['End'].item()):
                    l = c_row['Start'].item()
                    r = p_row['End'].item()
                else:
                    # child element is nested within the parent
                    candidate_idxs_to_remove.append(ci)
                    continue
                # calculate overlap, relative to child element
                o = (r - l) / c_len
                # if child element coverage is *less* than threshold, include it
                if o < t:
                    accepted_idxs.append(cv)
                # either way, we remove it from further consideration
                candidate_idxs_to_remove.append(ci)
            # make sure that we have no children left to test
            # if we have any candidate children left, something went wrong
            assert(len(candidate_idxs) == len(candidate_idxs_to_remove))
            # print accepted elements
            for acc_idx in accepted_idxs:
                acc_row = b.iloc[[acc_idx]]
                sys.stdout.write('{}n'.format('t'.join([str(x) for x in acc_row.values.flatten().tolist()])))

@click.command()
@click.option('--threshold', type=float, default=0.1, help='overlap threshold')
def main(threshold):
    # validate parameter
    assert((threshold > 0) and (threshold < 1))
    # import data
    df = df_from_intervals_str(test_intervals_str)
    # validate input -- only one chromosome at a time
    assert(len(df.Chromosome.unique()) == 1)
    df_as_ncl = ncl_from_df(df)
    gr = pr.from_string(test_intervals_str)
    mf = gr.merge().as_df()
    # convert column types to make compatible with ncls
    mf['Start'] = mf['Start'].astype(np.int64)
    mf['End'] = mf['End'].astype(np.int64)
    # associate intervals with merged regions by indices (analogous to bedmap)
    mf_ovr_df = ncl_all_overlaps(df_as_ncl, mf)
    mf_ovr_df_map = ovr_idxs_to_map(mf_ovr_df)
    # search through associations for those that meet overlap criteria
    search_candidates(mf_ovr_df_map, mf, df, threshold)

if __name__ == '__main__':
    main()

Some examples of running it with different thresholds:

$ ./biostars9470750.py --threshold=0.1
chr1    50  80  F
chr1    120 200 A
chr1    199 260 B
$ ./biostars9470750.py --threshold=0.5
chr1    50  80  F
$ ./biostars9470750.py --threshold=0.01
chr1    50  80  F
chr1    120 200 A



Source link