Source code for transposonmapper.processing.profileplot_genome_helpers

from transposonmapper.processing.chromosome_names_in_files import chromosome_name_bedfile
from transposonmapper.properties.get_chromosome_position import chromosome_position

import numpy as np 

def summed_chr(chr_length_dict):
    """Return the genome-wide start offset (in bp) of each chromosome.

    The offset of a chromosome is the summed length of all chromosomes
    that precede it in the fixed order I..XVI, so chromosome 'I' maps to 0.

    Parameters
    ----------
    chr_length_dict : dict
        A dictionary describing the length of each chromosome, keyed by
        roman numeral ('I' .. 'XVI'). Keys outside that set are ignored.
        Lengths are coerced with ``int()``, exactly as ``length_genome``
        does, so both helpers accept the same dictionaries.

    Returns
    -------
    dict
        A dictionary, in the order I..XVI, where each value is the
        cumulative sum of the lengths of the previous chromosomes.

    Raises
    ------
    TypeError
        If one of the sixteen chromosomes is missing from
        ``chr_length_dict`` (``int(None)``).
    """
    chrom_list = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
                  'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']

    summed_chr_length_dict = {}
    running_total = 0
    for chrom in chrom_list:
        # Store the total *before* adding this chromosome: that is its start.
        summed_chr_length_dict[chrom] = running_total
        # int() keeps the offsets usable as array indices and mirrors
        # the coercion applied in length_genome.
        running_total += int(chr_length_dict.get(chrom))
    return summed_chr_length_dict
def length_genome(chr_length_dict):
    """Return the total genome length in bp.

    Only the sixteen chromosomes I..XVI are counted; any other key in the
    dictionary is ignored.

    Parameters
    ----------
    chr_length_dict : dict
        A dictionary describing the length of each chromosome, keyed by
        roman numeral. Each length is passed through ``int()``.

    Returns
    -------
    int
        The length of the genome
    """
    roman_chromosomes = ('I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII',
                         'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI')
    return sum(int(chr_length_dict.get(name)) for name in roman_chromosomes)
def middle_chrom_pos(chr_length_dict):
    """Compute the middle point of each chromosome in genome coordinates.

    Each chromosome spans from its own start offset to the start offset of
    the next chromosome (or to the genome length for the last one); the
    returned value is the centre of that span.

    Parameters
    ----------
    chr_length_dict : dict
        A dictionary describing the length of each chromosome.

    Returns
    -------
    list
        A list describing for each chromosome the middle point, as floats
        in cumulative genome coordinates.
    """
    offsets = summed_chr(chr_length_dict)
    genome_end = length_genome(chr_length_dict)

    # Start positions: chromosome 'I' first, then every other entry in
    # the order the offsets dictionary yields them.
    starts = [offsets.get('I')]
    starts.extend(offset for name, offset in offsets.items() if name != 'I')

    # Each span ends where the next one starts; the last ends at the genome end.
    ends = starts[1:] + [genome_end]

    return [start + (end - start) / 2 for start, end in zip(starts, ends)]
def counts_genome(variable, bed_file, gff_file):
    """Count transposons or reads at every position of the genome.

    Parameters
    ----------
    variable : str
        "transposons" adds 1 for every bed-file entry; "reads" adds the
        value ``(score - 100) / 20`` computed from the fifth column of the
        entry. Any other value returns an array of zeros.
    bed_file : str
        absolute path of the location of the bedfile
    gff_file : str
        absolute path of the location of the gff file

    Returns
    -------
    numpy.ndarray
        An array of the length of the genome with the counts of each
        variable per location in the genome.

    Raises
    ------
    KeyError
        If a bed-file entry names a chromosome that is not among the
        values of the name mapping returned by ``chromosome_name_bedfile``.
    """
    with open(bed_file) as f:
        lines = f.readlines()

    chrom_names_dict, chrom_start_index_dict, chrom_end_index_dict = chromosome_name_bedfile(bed_file)
    chr_length_dict, _, _ = chromosome_position(gff_file)

    summed_chr_length_dict = summed_chr(chr_length_dict)
    allcounts_list = np.zeros(length_genome(chr_length_dict))

    if variable not in ("transposons", "reads"):
        # Unknown selector: nothing to count, the array stays all zeros.
        return allcounts_list

    # Reverse lookup (name used in the bed file -> chromosome key), built
    # once instead of scanning chrom_names_dict for every line.
    # setdefault keeps the first matching key when values repeat.
    chrom_by_bed_name = {}
    for chrom, bed_name in chrom_names_dict.items():
        chrom_by_bed_name.setdefault(bed_name, chrom)

    first_line = chrom_start_index_dict.get("I")
    last_line = chrom_end_index_dict.get("XVI")
    count_reads = variable == "reads"

    for line in lines[first_line:last_line + 1]:
        fields = line.split()
        chrom = chrom_by_bed_name[fields[0].replace("chr", '')]
        # Genome-wide index = chromosome start offset + position - 1.
        # NOTE(review): the -1 presumably converts a 1-based position to a
        # 0-based array index -- confirm against the bed-file writer.
        index = summed_chr_length_dict.get(chrom) + int(fields[1]) - 1
        if count_reads:
            # NOTE(review): the score column presumably encodes the read
            # count as 100 + 20 * reads -- confirm against the writer.
            allcounts_list[index] += (int(fields[4]) - 100) / 20
        else:
            allcounts_list[index] += 1

    return allcounts_list
def binned_list(allcounts_list, bar_width):
    """Sum the per-position counts into consecutive bins for a histogram.

    A new bin starts at every position ``p`` for which
    ``int(p % bar_width) == 0``. The value at that position is the first
    value of the new bin, so every input value ends up in exactly one bin
    and the bins add up to the total of ``allcounts_list``.

    The first entry of the result is always 0: it is the (empty) bin that
    is closed when position 0 opens the first real bin. It is kept so the
    length of the result does not change.
    NOTE(review): callers presumably pair the result with one more x
    position than there are bins -- confirm before removing this entry.

    Parameters
    ----------
    allcounts_list : numpy.ndarray
        Output of the counts_genome function
    bar_width : float
        It could be a function of the length of the genome
        e.g. bar_width=l_genome/1000

    Returns
    -------
    list
        Binned list; one leading 0 followed by one sum per bin.
    """
    allcounts_binnedlist = []
    sum_values = 0
    for position, count in enumerate(allcounts_list):
        if int(position % bar_width) == 0:
            # Bin boundary: store the finished bin and open the next one
            # with the current value, so the boundary value is not lost.
            allcounts_binnedlist.append(sum_values)
            sum_values = count
        else:
            sum_values += count
    # Flush the last (possibly partial) bin.
    allcounts_binnedlist.append(sum_values)
    return allcounts_binnedlist