import matplotlib.pyplot as plt
import numpy as np
import os
from scipy import stats
from transposonmapper.statistics import dataframe_from_pergenefile
[docs]def make_datafile(path_a,filelist_a,path_b,filelist_b):
"""Assembly the datafile name to analyze
Parameters
----------
path_a : str
Path of the files corresponding to the reference library
filelist_a : list of str
List of the filenames of the different replicates from the reference library.
It has to have minimum two replicates per library, so the list has to contain
a minimum of two files.
path_b : str
Path of the files corresponding to the experimental library
filelist_b : list of str
List of the filenames of the different replicates from the experimental library.
It has to have minimum two replicates per library, so the list has to contain
a minimum of two files.
Returns
-------
str
Complete paths of the reference and the experimental libraries
"""
datafiles_list_a = []
datafiles_list_b = []
for files in filelist_a:
datafile = os.path.join(path_a, files)
assert os.path.isfile(datafile), 'File not found at: %s' % datafile
datafiles_list_a.append(datafile)
for files in filelist_b:
datafile = os.path.join(path_b, files)
assert os.path.isfile(datafile), 'File not found at: %s' % datafile
datafiles_list_b.append(datafile)
return datafiles_list_a,datafiles_list_b
[docs]def info_from_datasets(datafiles_list_a,datafiles_list_b,variable,normalize):
"""Read the information contain in the datafiles for the volcano plot
Parameters
----------
datafiles_list_a : list of str
List of the absolute paths of all the replicates from the
reference library.
datafiles_list_b : list of str
List of the absolute paths of all the replicates from the
experimental library.
variable : str
Magnitude indicating based on what to make the volcano plot.
For example: tn_per_gene, read_per_gene or Nreadsperinsrt
normalize : bool
If True , If set to True, each gene is normalized based on
the total count in each dataset (i.e. each file in filelist_)
Returns
-------
variable_a_array : numpy.array
variable_b_array: numpy.array
volcano_df: pandas.core.frame.DataFrame
tnread_gene_a: pandas.core.frame.DataFrame
tnread_gene_b: pandas.core.frame.DataFrame
"""
tn_per_gene_zeroreplace = 5 #Add 5 insertions to every gene
read_per_gene_zeroreplace = 25 #Add 25 reads to every gene
# norm_a = 0
# norm_b = 0
for count, datafile_a in enumerate(datafiles_list_a):
tnread_gene_a = dataframe_from_pergenefile(datafile_a, verbose=False)
if normalize == True:
if variable == 'tn_per_gene':
norm_a = sum(tnread_gene_a.tn_per_gene)#*10**-4
elif variable == 'read_per_gene':
norm_a = sum(tnread_gene_a.read_per_gene)#*10**-7
elif variable == 'Nreadsperinsrt':
norm_a = sum(tnread_gene_a.Nreadsperinsrt)
#ADD A CONSTANT TO ALL VALUES TO PREVENT A ZERO DIVISION WHEN DETERMINING THE FOLD CHANGE.
tnread_gene_a.tn_per_gene = tnread_gene_a.tn_per_gene + tn_per_gene_zeroreplace
tnread_gene_a.read_per_gene = tnread_gene_a.read_per_gene + read_per_gene_zeroreplace
tnread_gene_a.Nreadsperinsrt = tnread_gene_a.Nreadsperinsrt + (read_per_gene_zeroreplace/tn_per_gene_zeroreplace)
if count == 0:
volcano_df = tnread_gene_a[['gene_names']] #initialize new dataframe with gene_names
if normalize == True:
variable_a_array = np.divide(tnread_gene_a[[variable]].to_numpy(), norm_a) #create numpy array to store normalized data
else:
variable_a_array = tnread_gene_a[[variable]].to_numpy() #create numpy array to store raw data
else:
if normalize == True:
variable_a_array = np.append(variable_a_array, np.divide(tnread_gene_a[[variable]].to_numpy(), norm_a), axis=1) #append normalized data
else:
variable_a_array = np.append(variable_a_array, tnread_gene_a[[variable]].to_numpy(), axis=1) #append raw data
for count, datafile_b in enumerate(datafiles_list_b):
tnread_gene_b = dataframe_from_pergenefile(datafile_b, verbose=False)
if normalize == True:
if variable == 'tn_per_gene':
norm_b = sum(tnread_gene_b.tn_per_gene)#*10**-4
elif variable == 'read_per_gene':
norm_b = sum(tnread_gene_b.read_per_gene)#*10**-7
elif variable == 'Nreadsperinsrt':
norm_b = sum(tnread_gene_b.Nreadsperinsrt)
#ADD A CONSTANT TO ALL VALUES TO PREVENT A ZERO DIVISION WHEN DETERMINING THE FOLD CHANGE.
tnread_gene_b.tn_per_gene = tnread_gene_b.tn_per_gene + tn_per_gene_zeroreplace
tnread_gene_b.read_per_gene = tnread_gene_b.read_per_gene + read_per_gene_zeroreplace
tnread_gene_b.Nreadsperinsrt = tnread_gene_b.Nreadsperinsrt + (read_per_gene_zeroreplace/tn_per_gene_zeroreplace)
if count == 0:
if normalize == True:
variable_b_array = np.divide(tnread_gene_b[[variable]].to_numpy(), norm_b)
else:
variable_b_array = tnread_gene_b[[variable]].to_numpy()
else:
if normalize == True:
variable_b_array = np.append(variable_b_array, np.divide(tnread_gene_b[[variable]].to_numpy(), norm_b), axis=1)
else:
variable_b_array = np.append(variable_b_array, tnread_gene_b[[variable]].to_numpy(), axis=1)
return variable_a_array,variable_b_array,volcano_df,tnread_gene_a,tnread_gene_b
[docs]def apply_stats(variable_a_array,variable_b_array,significance_threshold,volcano_df):
"""This function computes the statistics measure for the volcano plot
Parameters
----------
variable_a_array : array
The values (# of insertions or reads) of the replicates of one library
variable_b_array : array
The values (# of insertions or reads) of the replicates of the other library
significance_threshold : float
It will use the default value in the volcano function which is 0.01
Returns
-------
dataframe
A dataframe containing all the info for the volcano plot.
"""
ttest_tval_list = [np.nan]*len(variable_a_array) #initialize list for storing t statistics
ttest_pval_list = [np.nan]*len(variable_a_array) #initialize list for storing p-values
signif_thres_list = [False]*len(variable_a_array) #initialize boolean list for indicating datapoints with p-value above threshold
fc_list = [np.nan]*len(variable_a_array) #initialize list for storing fold changes
for count,val in enumerate(variable_a_array):
ttest_val = stats.ttest_ind(variable_a_array[count], variable_b_array[count]) #T-test
ttest_tval_list[count] = ttest_val[0]
if not ttest_val[1] == 0: #prevent p=0 to be inputted in log
ttest_pval_list[count] = -1*np.log10(ttest_val[1])
else:
ttest_pval_list[count] = 0
if ttest_pval_list[count] > -1*np.log10(significance_threshold):
signif_thres_list[count] = True
#DETERMINE FOLD CHANGE PER GENE
if np.mean(variable_b_array[count]) == 0 and np.mean(variable_a_array[count]) == 0:
fc_list[count] = 0
else:
fc_list[count] = np.log2(np.mean(variable_a_array[count]) / np.mean(variable_b_array[count]))
volcano_df['fold_change'] = fc_list
volcano_df['t_statistic'] = ttest_tval_list
volcano_df['p_value'] = ttest_pval_list
volcano_df['significance'] = signif_thres_list
return volcano_df