I'm trying to calculate the GC3 content of tRNA encoding genes only. I have the gbff annotated genomes and the tabular versions, an example of which is here: www.ncbi.nlm.nih.gov/genome/browse/#!/proteins/665/300274%7CBacillus%20subtilis%20subsp.%20subtilis%20str.%20168/
I have already written a script that can extracts each gene of the whole genome, but how would I modify this to just the tRNA encoding genes using the table? I'm new to Python so any explanation would be great!
My function that pulls out the individual genes is below:
def extractinfo(file_path): with io.open(file_path, mode="r", encoding="utf-8") as file: # skips over the first line file.readline() # make an empty list SSL_list = list() # read through the whole file line by line for line in file.readlines(): # separates by comma, pulls out specific headings as a list SSL = line.split(",") # convert the number string to an integer start = int(SSL) stop = int(SSL) # replace double quotes with nothing (use single quotes to differentiate) locustag = SSL.replace('"', "") ID = SSL.replace('"', "") # adds start, stop and locus tags to the end of the empty list every time it loops SSL_list.append((start, stop, locustag, ID)) return SSL_list def extract_seq(file_path): with open(file_path) as file: genome_sequence = str() # True runs forever while True: # skips lines, if the line starts with origin, it stops, leaving us a line after line = file.readline() if line.startswith("ORIGIN"): break for line in file.readlines(): # replace spaces with nothing line = line.replace(" ", "") # starts at 0, continues until the end of the line (genome sequence) for i in range(0, len(line)): # checks if chararacters are actg if line[i] in "actg": genome_sequence += (line[i]) return genome_sequence end_file = "/Users/" def extract_genes(SSL_list, genome_sequence): for start, stop, locustag, ID in SSL_list: # extracts start:stop gene from the sequence gene_substring = genome_sequence[start:stop] # store in file with open(end_file + "https://www.biostars.org/" + ID + "+" + locustag + ".txt", "w") as file: file.write(gene_substring) work_dir = "/Users/" for path in glob.glob(os.path.join(work_dir, "*.csv")): SSL_list = extractinfo(path) txt_file = path.replace(".csv", ".gbff") sequences = extract_seq(txt_file) extract_genes(SSL_list, sequences) print(path)