Source code for transposonmapper.statistics.dataframe_from_pergene_helpers

import os
import numpy as np
import re 

from transposonmapper.importing import load_default_files
from transposonmapper.processing import list_known_essentials

def read_pergene_file(pergenefile):
    """Read the content of a pergene file, one of the outputs of Transposonmapper.

    The first line of the file is treated as a header and skipped. Every
    remaining non-blank line is split on spaces and tabs, and its first three
    fields are taken as gene name, number of insertions and number of reads.
    Blank (whitespace-only) lines, such as a trailing empty line, are ignored.

    Parameters
    ----------
    pergenefile : str
        Path to the pergene.txt file, one of the outputs of the
        transposonmapper module.

    Returns
    -------
    list
        Gene names list
    list
        Insertion list (one int per gene)
    list
        Reads list (one int per gene)
    list
        The raw text lines that were parsed (header and blank lines
        excluded), one entry per gene.

    Raises
    ------
    AssertionError
        If ``pergenefile`` is not an existing file.
    IndexError
        If a non-blank data line has fewer than three fields.
    ValueError
        If the second or third field of a data line is not an integer.
    """
    # Kept as an assert so the exception type seen by existing callers
    # does not change.
    assert os.path.isfile(pergenefile), 'File not found at: %s' % pergenefile

    with open(pergenefile) as f:
        raw_lines = f.readlines()[1:]  # skip header

    genenames_list = []
    tnpergene_list = []
    readpergene_list = []
    lines = []
    for line in raw_lines:
        # Fields may be separated by any mix of spaces and tabs; drop the
        # empty strings produced by consecutive separators.
        fields = [x for x in re.split(r' |\t', line.strip('\n')) if x]
        if not fields:
            # Blank line: nothing to parse. Skipping it (instead of indexing
            # into an empty list) keeps all four returned lists aligned.
            continue

        gene_name = fields[0]
        insertions = int(fields[1])
        reads = int(fields[2])

        genenames_list.append(gene_name)
        tnpergene_list.append(insertions)
        readpergene_list.append(reads)
        lines.append(line)

    return genenames_list, tnpergene_list, readpergene_list, lines
def reads_per_insertion(tnpergene_list, readpergene_list, lines):
    """Compute the reads per insertion for every gene.

    For a gene with fewer than 5 insertions the value is 0; otherwise it is
    ``reads / (insertions - 1)``.

    Parameters
    ----------
    tnpergene_list : list
        Number of insertions per gene.
    readpergene_list : list
        Number of reads per gene, indexed like ``tnpergene_list``.
    lines : list
        Any sized object; only its length is used, to size the output list.

    Returns
    -------
    list
        Reads per insertion per gene. The list has ``len(lines)`` entries;
        positions beyond ``len(tnpergene_list)`` are left as ``np.nan``.
    """
    per_insertion = [np.nan] * len(lines)

    for idx, n_insertions in enumerate(tnpergene_list):
        if n_insertions < 5:
            # Too few insertions: the ratio is reported as 0.
            per_insertion[idx] = 0
        else:
            per_insertion[idx] = readpergene_list[idx] / (n_insertions - 1)

    return per_insertion
def essential_genes(genenames_list, lines):
    """Flag which of the given genes are known essential genes.

    The set of known essential genes is obtained by passing the second item
    returned by ``load_default_files()`` to ``list_known_essentials()``.

    Parameters
    ----------
    genenames_list : list
        Names of the genes to check.
    lines : list
        Any sized object; only its length is used, to size the output list.

    Returns
    -------
    list
        One entry per position in ``lines``: ``True`` if the gene at that
        index of ``genenames_list`` is among the known essentials, ``False``
        if it is not. Positions beyond ``len(genenames_list)`` stay ``None``.
    """
    # load_default_files() must return exactly three items; only the middle
    # one is used here (presumably the essential-genes reference — confirm
    # against transposonmapper.importing).
    _first, essentials_source, _third = load_default_files()
    known_essentials = list_known_essentials(essentials_source)

    is_essential = [None] * len(lines)
    for idx, gene_name in enumerate(genenames_list):
        is_essential[idx] = gene_name in known_essentials

    return is_essential