gravatar for mmargolis27

4 hours ago by

Hello,

I'm trying to find the overlap where neither the 5' start NOR the 3' end coordinates match with each other based on a reference file and a gene prediction file. However, my nested for loop is giving me 8000+ results, when I need only ~80. I'm not sure how to find the exact start/stop positions either that match these conditionals. My input files are a .gb file and a .glimmer file.

def main():

#initialize variables for counts
gb_count = 0
glimmer_count = 0
exact_count = 0
five_prime_count = 0 
three_prime_count = 0
no_matches_count = 0

#protein_id list
protein_id = []

#initialize lists for start/stop coordinates
reference = []
prediction = []
prediction_start = []
prediction_end = []
reference_start = []
reference_end = []
three_prime_agree = []
five_prime_agree = []




#read in GeneBank file
for line in open('.gb file'):

    line = line.rstrip()

    if "protein_id=" in line:
        pro_id = line.split("=")
        pro_id = pro_id[1].replace('"','')
        protein_id.append(pro_id)

    elif "CDS" in line:
        if "join" in line:
            continue

        elif "/translation" in line:
            continue

        elif "P" in line:
            continue

        elif "complement" in line:
            value = " ".join(line.split()).replace('CDS','').replace("(",'').replace(")",'').split("complement")
            newValue = value[1].split("..")
            ref_start = newValue[1].replace('>','').replace("' '", '').replace('<','')
            reference_start.append(ref_start)
            ref_end = newValue[0]
            reference_end.append(ref_end)
            gb_count += 1


        else:
            test = " ".join(line.split()).replace('CDS','').split("..")
            ref_start = test[0].replace(">",'').replace(" ", '')
            reference_start.append(ref_start)
            ref_end = test[1]
            reference_end.append(ref_end)
            gb_count += 1
        reference.append({'refstart': ref_start, 'refend': ref_end})

# Read in Glimmer/Prediction file
for l in open('.glimmer file'):
    l = l.rstrip()

    #get header
    if l.startswith(">"):
        l = l.split("  ")
        seq = l[1].split(" ")
        pre_start = seq[0]
        prediction_start.append(pre_start)
        pre_end = seq[1]
        prediction_end.append(pre_end)
        glimmer_count += 1
        prediction.append({'predictionStart': pre_start, 'predictionEnd': pre_end})

    #3' overlap
 for pre in prediction:
     for ref in reference:
         if (ref['refend'] == pre['predictionEnd']) and (ref['refstart'] != pre['predictionStart']):
             three_prime_count += 1
             test_pre.append("3' Agree")
             print("So far at least 3' ends match", three_prime_count)

## Neither overlap
for pre in prediction:
for ref in reference:
if (ref['refend'] != pre['predictionEnd']) and (ref['refstart'] != pre['predictionStart']):
no_matches_count += 1
print("No matches", no_matches_count)



Source link