Source code for transposonmapper.statistics.volcano_helpers

import matplotlib.pyplot as plt
import numpy as np 
import os 
from scipy import stats

from transposonmapper.statistics import dataframe_from_pergenefile

[docs]def make_datafile(path_a,filelist_a,path_b,filelist_b):
    """Assembly the datafile name to analyze 

    Parameters
    ----------
    path_a : str
        Path of the files corresponding to the reference library
    filelist_a : list of str 
        List of the filenames of the different replicates from the reference library.
        It has to have minimum two replicates per library, so the list has to contain
        a minimum of two files. 
    path_b : str 
        Path of the files corresponding to the experimental library
    filelist_b : list of str
        List of the filenames of the different replicates from the experimental library.
        It has to have minimum two replicates per library, so the list has to contain
        a minimum of two files. 


    Returns
    -------
    str
        Complete paths of the reference and the experimental libraries 
    """
    datafiles_list_a = []
    datafiles_list_b = []
    for files in filelist_a:
        datafile = os.path.join(path_a, files)
        assert os.path.isfile(datafile), 'File not found at: %s' % datafile
        datafiles_list_a.append(datafile)
    for files in filelist_b:
        datafile = os.path.join(path_b, files)
        assert os.path.isfile(datafile), 'File not found at: %s' % datafile
        datafiles_list_b.append(datafile)
    return datafiles_list_a,datafiles_list_b


[docs]def info_from_datasets(datafiles_list_a,datafiles_list_b,variable,normalize):
    """Read the information contain in the datafiles for the volcano plot 

    Parameters
    ----------
    datafiles_list_a : list of str 
        List of the absolute paths of all the replicates from the 
        reference library. 
    datafiles_list_b : list of str 
        List of the absolute paths of all the replicates from the 
        experimental  library. 
    variable : str
        Magnitude indicating based on what to make the volcano plot. 
        For example: tn_per_gene, read_per_gene or Nreadsperinsrt 

    normalize : bool 
        If True , If set to True, each gene is normalized based on 
        the total count in each dataset (i.e. each file in filelist_)
        

    Returns
    -------
    variable_a_array : numpy.array
    variable_b_array: numpy.array
    volcano_df: pandas.core.frame.DataFrame
    tnread_gene_a: pandas.core.frame.DataFrame
    tnread_gene_b: pandas.core.frame.DataFrame
    """

    tn_per_gene_zeroreplace = 5 #Add 5 insertions to every gene
    read_per_gene_zeroreplace = 25 #Add 25 reads to every gene
    # norm_a = 0
    # norm_b = 0
    for count, datafile_a in enumerate(datafiles_list_a):
        tnread_gene_a = dataframe_from_pergenefile(datafile_a, verbose=False)
        if normalize == True:
            if variable == 'tn_per_gene':
                norm_a = sum(tnread_gene_a.tn_per_gene)#*10**-4
            elif variable == 'read_per_gene':
                norm_a = sum(tnread_gene_a.read_per_gene)#*10**-7
            elif variable == 'Nreadsperinsrt':
                norm_a = sum(tnread_gene_a.Nreadsperinsrt)

        #ADD A CONSTANT TO ALL VALUES TO PREVENT A ZERO DIVISION WHEN DETERMINING THE FOLD CHANGE.
        tnread_gene_a.tn_per_gene = tnread_gene_a.tn_per_gene + tn_per_gene_zeroreplace
        tnread_gene_a.read_per_gene = tnread_gene_a.read_per_gene + read_per_gene_zeroreplace
        tnread_gene_a.Nreadsperinsrt = tnread_gene_a.Nreadsperinsrt + (read_per_gene_zeroreplace/tn_per_gene_zeroreplace)

        if count == 0:
            volcano_df = tnread_gene_a[['gene_names']] #initialize new dataframe with gene_names
            if normalize == True:
                variable_a_array = np.divide(tnread_gene_a[[variable]].to_numpy(), norm_a) #create numpy array to store normalized data
            else:
                variable_a_array = tnread_gene_a[[variable]].to_numpy() #create numpy array to store raw data
        else: 
            if normalize == True:
                variable_a_array = np.append(variable_a_array, np.divide(tnread_gene_a[[variable]].to_numpy(), norm_a), axis=1) #append normalized data
            else:
                variable_a_array = np.append(variable_a_array, tnread_gene_a[[variable]].to_numpy(), axis=1) #append raw data


    for count, datafile_b in enumerate(datafiles_list_b):
        tnread_gene_b = dataframe_from_pergenefile(datafile_b, verbose=False)
        if normalize == True:
            if variable == 'tn_per_gene':
                norm_b = sum(tnread_gene_b.tn_per_gene)#*10**-4
            elif variable == 'read_per_gene':
                norm_b = sum(tnread_gene_b.read_per_gene)#*10**-7
            elif variable == 'Nreadsperinsrt':
                norm_b = sum(tnread_gene_b.Nreadsperinsrt)

        #ADD A CONSTANT TO ALL VALUES TO PREVENT A ZERO DIVISION WHEN DETERMINING THE FOLD CHANGE.
        tnread_gene_b.tn_per_gene = tnread_gene_b.tn_per_gene + tn_per_gene_zeroreplace
        tnread_gene_b.read_per_gene = tnread_gene_b.read_per_gene + read_per_gene_zeroreplace
        tnread_gene_b.Nreadsperinsrt = tnread_gene_b.Nreadsperinsrt + (read_per_gene_zeroreplace/tn_per_gene_zeroreplace)

        if count == 0:
            if normalize == True:
                variable_b_array = np.divide(tnread_gene_b[[variable]].to_numpy(), norm_b)
            else:
                variable_b_array = tnread_gene_b[[variable]].to_numpy()
        else:
            if normalize == True:
                variable_b_array = np.append(variable_b_array, np.divide(tnread_gene_b[[variable]].to_numpy(), norm_b), axis=1)
            else:
                variable_b_array = np.append(variable_b_array, tnread_gene_b[[variable]].to_numpy(), axis=1)

    return variable_a_array,variable_b_array,volcano_df,tnread_gene_a,tnread_gene_b


[docs]def apply_stats(variable_a_array,variable_b_array,significance_threshold,volcano_df):
    """This function computes the statistics measure for the volcano plot 

    Parameters
    ----------
    variable_a_array : array
        The values (# of insertions or reads) of the replicates of one library
    variable_b_array : array 
        The values (# of insertions or reads) of the  replicates of the other library
    significance_threshold : float 
        It will use the default value in the volcano function which is 0.01

    Returns
    -------
    dataframe 
        A dataframe containing all the info for the volcano plot. 
    """
    
    ttest_tval_list = [np.nan]*len(variable_a_array) #initialize list for storing t statistics
    ttest_pval_list = [np.nan]*len(variable_a_array) #initialize list for storing p-values
    signif_thres_list = [False]*len(variable_a_array) #initialize boolean list for indicating datapoints with p-value above threshold
    fc_list = [np.nan]*len(variable_a_array) #initialize list for storing fold changes

    for count,val  in enumerate(variable_a_array):

        ttest_val = stats.ttest_ind(variable_a_array[count], variable_b_array[count]) #T-test
        ttest_tval_list[count] = ttest_val[0]
        if not ttest_val[1] == 0: #prevent p=0 to be inputted in log
            ttest_pval_list[count] = -1*np.log10(ttest_val[1])
        else:
            ttest_pval_list[count] = 0
        if ttest_pval_list[count] > -1*np.log10(significance_threshold):
            signif_thres_list[count] = True

    #DETERMINE FOLD CHANGE PER GENE
        if np.mean(variable_b_array[count]) == 0 and np.mean(variable_a_array[count]) == 0:
            fc_list[count] = 0
        else:
            fc_list[count] = np.log2(np.mean(variable_a_array[count]) / np.mean(variable_b_array[count]))

    
    volcano_df['fold_change'] = fc_list
    volcano_df['t_statistic'] = ttest_tval_list
    volcano_df['p_value'] = ttest_pval_list
    volcano_df['significance'] = signif_thres_list

    return volcano_df
SATAY pipeline at Delft :)

Source code for transposonmapper.statistics.volcano_helpers