Source code for transposonmapper.processing.genomicfeatures_dataframe

import os, sys
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np 
import pkg_resources


from transposonmapper.properties import chromosome_position,gene_aliases
from transposonmapper.processing import chromosome_name_wigfile

from transposonmapper.processing.read_sgdfeatures import sgd_features


from transposonmapper.importing import (
    load_default_files, load_sgd_tab
)


from transposonmapper.processing.dna_features_helpers import (build_dataframe, input_region, intergenic_regions, read_pergene_file, 
                                                              read_wig_file,gene_location,checking_features)
from transposonmapper.plotting.dna_features_plot import dna_features_plot

def dna_features(region, wig_file, pergene_insertions_file, variable="reads", plotting=True, savefigure=False, verbose=True):
    """Build a dataframe describing all genomic features of the chromosome that contains a user-defined region.

    The user-defined region can be a chromosome number, a sub-region of a chromosome or a gene.
    The resulting dataframe holds information about all genomic features in that chromosome
    (i.e. genes, nc-DNA etc.). A genomic region is here defined as a gene (separated as annotated
    essential and not essential), telomere, centromere, ars etc.

    This can be used to determine the number of reads outside the genes, for example to identify
    neutral regions (i.e. genomic regions that, if inhibited, do not influence the fitness of the
    cells) and to use those for normalizing the transposon insertions per gene.

    Optionally a barplot indicating the number of transposons per genomic region is created.

    Parameters
    ----------
    region : str or list
        Region of interest. Either a chromosome number (a normal number between 1 and 16 or
        roman numerals between I and XVI), a list like ['V', 0, 14790] (which creates a barplot
        between basepair 0 and 14790), or a gene name.
    wig_file : str
        Absolute path of the wig file.
    pergene_insertions_file : str
        Absolute path of the _pergene_insertions.txt file.
    variable : str, optional
        Either "reads" or "transposons", by default "reads". The value is lower-cased and
        passed on to the plotting routine; it also selects the name of the saved figure.
    plotting : bool, optional
        Whether or not to produce a bar plot with the reads/insertions per genomic location
        in the region, by default True.
    savefigure : bool, optional
        Whether or not to save the current figure in the package's ``data_files`` folder,
        by default False.
    verbose : bool, optional
        Determines how much textual feedback is given by the helper functions. By default True.
        Note that the information about a selected gene is printed regardless of this flag.

    Returns
    -------
    pandas.DataFrame
        Dataframe containing information about the selected chromosome.

    Raises
    ------
    AssertionError
        If one of the default data file paths is empty/falsy.
    """

    # Load the default annotation files.
    gff_file, essentials_file, gene_information_file = load_default_files()
    sgd_features_file = load_sgd_tab()

    # Verify that a path was returned for every default file.
    # Only the truthiness of the path is checked here, not whether the file exists on disk.
    data_files = {
        "gff3": gff_file,
        "essentials": essentials_file,
        "gene_names": gene_information_file,
        "sgd_features": sgd_features_file
    }

    for filetype, file_path in data_files.items():
        assert file_path, f"{filetype} not found at {file_path}"

    variable = variable.lower()

    # Determine the inputted region (boundaries, type of region and chromosome).
    roi_start, roi_end, region_type, chrom = input_region(region=region, verbose=verbose)

    # Read the wig file to get the locations of all transposon insertions and reads.
    insrt_in_chrom_list, reads_in_chrom_list = read_wig_file(wig_file=wig_file, chrom=chrom)

    # Read the pergene_insertions file for the location of all insertions per gene.
    gene_position_dict = read_pergene_file(pergene_insertions_file=pergene_insertions_file, chrom=chrom)

    # Determine the location of the genomic features in the current chromosome.
    dna_dict, start_chr, end_chr, len_chr, feature_orf_dict = gene_location(chrom, gene_position_dict, verbose)

    # Get the features from the intergenic regions.
    dna_dict_new, genomicregions_list = intergenic_regions(chrom, start_chr, dna_dict)

    # Test whether the elements in feature_orf_dict for the selected chromosome are the same
    # as the genes in gene_position_dict.
    checking_features(feature_orf_dict, chrom, gene_position_dict, verbose)

    dna_df2 = build_dataframe(dna_dict_new, start_chr, end_chr, insrt_in_chrom_list, reads_in_chrom_list, genomicregions_list, chrom)

    # Print the information for the selected gene (matched on systematic or standard name).
    if region_type == 'Gene':
        region_upper = region.upper()
        for region_info in dna_df2.itertuples():
            if region_info.Feature_name == region_upper or region_info.Standard_name == region_upper:
                print(region_info)

    # Create the bar plot; the helper receives the plotting flag and decides what to draw.
    dna_features_plot(chrom, dna_df2, roi_start, roi_end, len_chr, plotting, variable)

    if savefigure:
        # NOTE(review): the current matplotlib figure is saved even when plotting is False;
        # confirm whether dna_features_plot creates a figure in that case.
        file_dirname = pkg_resources.resource_filename("transposonmapper", "data_files/")
        if variable == 'reads':
            name_prefix = 'GenomicFeaturesReads_Barplot_Chrom'
        else:
            name_prefix = 'GenomicFeaturesInsertions_Barplot_Chrom'
        saving_name = os.path.join(file_dirname, name_prefix + chrom + '_NonNormalized')
        plt.savefig(saving_name, orientation='landscape', dpi=200)
        plt.close()

    return dna_df2
SATAY pipeline at Delft :)

Source code for transposonmapper.processing.genomicfeatures_dataframe