import os
import numpy as np
import matplotlib.pyplot as plt
from transposonmapper.properties.get_chromosome_position import chromosome_position
from transposonmapper.properties.get_gene_position import gene_position
from transposonmapper.processing.chromosome_names_in_files import chromosome_name_bedfile
from transposonmapper.processing.essential_genes_names import list_known_essentials
from transposonmapper.importing import load_default_files
from transposonmapper.plotting import profile_genome_plot
from transposonmapper.processing.profileplot_genome_helpers import (summed_chr,
length_genome,middle_chrom_pos,counts_genome,binned_list)
[docs]def profile_genome(bed_file=None, variable="transposons", bar_width=None, savefig=False,showfig=False):
"""Created on Thu Mar 18 13:05:39 2021
@author: gregoryvanbeek
This function creates a bar plot along the entire genome.
The height of each bar represents the number of transposons or reads at the genomic position indicated on the x-axis.
The bar_width determines how many basepairs are put in one bin. Little basepairs per bin may be slow.
Too many basepairs in one bin and possible low transposon areas might be obscured.
Parameters
----------
bed_file : str, optional
The file path to the location of the bed file in your filesystem, by default None
variable : str, optional
The variable for plotting throughput the genome, by default "transposons"
bar_width : int, optional
The width for the histogram of the plot, by default None , which means internally the length of the genome over 1000
savefig : bool, optional
Save the figure if True, by default False
showfig : bool, optional
Show the figure if True, by default False
Returns
-------
list
All insertion sites
list
Binned insertion sites according the width
"""
# If necessary, load default files
gff_file, essential_file, gene_name_file = load_default_files(
gff_file=None, essentials_file=None, gene_names_file=None
)
# Verify presence of files
data_files = {
"gff3": gff_file,
"essentials": essential_file,
"gene_names": gene_name_file,
}
for filetype, file_path in data_files.items():
assert file_path, f"{filetype} not found at {file_path}"
chrom_list = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
chr_length_dict, chr_start_pos_dict, chr_end_pos_dict = chromosome_position(gff_file)
summed_chr_length_dict=summed_chr(chr_length_dict)
l_genome=length_genome(chr_length_dict)
if bar_width == None:
bar_width = l_genome/1000
print('Genome length: ', l_genome)
middle_chr_position=middle_chrom_pos(chr_length_dict)
gene_pos_dict = gene_position(gff_file)
genes_currentchrom_pos_list = [k for k, v in gene_pos_dict.items()]
genes_essential_list = list_known_essentials(essential_file)
allcounts_list=counts_genome(variable,bed_file,gff_file)
allcounts_binnedlist=binned_list(allcounts_list,bar_width)
if bar_width == (l_genome/1000):
allinsertionsites_list = np.linspace(0,l_genome,int(l_genome/bar_width+1))
else:
allinsertionsites_list = np.linspace(0,l_genome,int(l_genome/bar_width+2))
##########Ploting##############
profile_genome_plot(bar_width,l_genome,allinsertionsites_list,allcounts_binnedlist,summed_chr_length_dict,
middle_chr_position,chrom_list,variable,genes_currentchrom_pos_list,gene_pos_dict)
# saving the plot
if savefig == True and variable == "transposons":
savepath = os.path.splitext(bed_file)
print('saving figure at %s' % savepath[0]+'_transposonplot_genome.png')
plt.savefig(savepath[0]+'_transposonplot_genome.png', dpi=400)
plt.close()
elif savefig == True and variable == "reads":
savepath = os.path.splitext(bed_file)
print('saving figure at %s' % savepath[0]+'_readplot_genome.png')
plt.savefig(savepath[0]+'_readplot_genome.png', dpi=400)
plt.close()
if showfig==True:
plt.show()
return allinsertionsites_list,allcounts_binnedlist