import os, sys
import matplotlib.pyplot as plt
from transposonmapper.statistics.volcano_helpers import apply_stats, info_from_datasets, make_datafile
[docs]def volcano(path_a, filelist_a, path_b, filelist_b, variable='read_per_gene', significance_threshold=0.01, normalize=True, trackgene_list=[], figure_title=""):
"""This script creates a volcanoplot to show the significance of fold change between two datasets.
It is based on this website:
- https://towardsdatascience.com/inferential-statistics-series-t-test-using-numpy-2718f8f9bf2f
- https://www.statisticshowto.com/independent-samples-t-test/
Code for showing gene name when hovering over datapoint is based on:
- https://stackoverflow.com/questions/7908636/possible-to-make-labels-appear-when-hovering-over-a-point-in-matplotlib
T-test is measuring the number of standard deviations our measured mean is from the baseline mean, while taking into
account that the standard deviation of the mean can change as we get more data
This creates a volcano plot that shows the fold change between two libraries and the corresponding p-values.
The fold change is determined by the mean of dataset b (experimental set) divided by the mean of dataset a (reference set).
The datasets can be of different length.
P-value is determined based on the student t-test (scipy.stats.ttest_ind).
NOTE:
The fold change is determined by the ratio between the reference and the experimental dataset.
When one of the datasets is 0, this is false results for the fold change.
To prevent this, the genes with 0 insertions are set to have 5 insertions, and the genes with 0 reads are set to have 25 reads.
These values are determined in dicussion with the Kornmann lab.
- Created on Tue Feb 16 14:06:48 2021
- @author: gregoryvanbeek
Parameters
----------
path_a : str
paths to location of the datafiles for library a
filelist_a : str
list of the names of the datafiles for library a located in path_a.
The type of file here is the pergene.txt file , which is one of the outputs from the transposonmapper function.
The format of the pergene file should be TAB separated and NOT COMMA separated. if you have
it as comma separated you can convert to tab separated using the command line with
this command: `cat oldfile.txt | tr '[,]' '[\t]' > newfile.txt`
path_b : str
paths to location of the datafiles for library b
filelist_b : str
list of the names of the datafiles for library b located in path_b
The type of file here is the pergene.txt file , which is one of the outputs from the transposonmapper function.
The format of the pergene file should be TAB separated and NOT COMMA separated. if you have
it as comma separated you can convert to tab separated using the command line with
this command: `cat oldfile.txt | tr '[,]' '[\t]' > newfile.txt`
variable : str, optional
tn_per_gene, read_per_gene or Nreadsperinsrt , by default 'read_per_gene'
significance_threshold : float, optional
Threshold value above which the fold change is regarded significant, only for plotting, by default 0.01
normalize : bool, optional
Whether to normalize variable. If set to True, each gene is normalized based on the total count in each dataset (i.e. each file in filelist_)
, by default True
trackgene_list : list, optional
Enter a list of gene name(s) which will be highlighted in the plot (e.g. ['cdc42', 'nrp1']), by default []
figure_title : str, optional
The title of the figure if not empty, by default ""
Returns
-------
dataframe
A dataframe containing:
- gene_names
- fold change
- t statistic
- p value
- whether p value is above threshold
figure
- volcanoplot with the log2 fold change between the two libraries and the -log10 p-value.
"""
### Making the whole datafile name
datafiles_list_a,datafiles_list_b=make_datafile(path_a,filelist_a,path_b,filelist_b)
### Extract information from datasets
variable_a_array,variable_b_array,volcano_df,tnread_gene_a,_=info_from_datasets(datafiles_list_a,datafiles_list_b,variable,normalize)
### APPLY stats.ttest_ind(A,B)
volcano_df=apply_stats(variable_a_array,variable_b_array,significance_threshold,volcano_df)
### Volcanoplot
print('Plotting: %s' % variable)
fig = plt.figure(figsize=(19.0,9.0))#(27.0,3))
grid = plt.GridSpec(1, 1, wspace=0.0, hspace=0.0)
ax = plt.subplot(grid[0,0])
colors = {False:'black', True:'red'} # based on p-value significance
sc = ax.scatter(x=volcano_df['fold_change'], y=volcano_df['p_value'], alpha=0.4, marker='.', c=volcano_df['significance'].apply(lambda x:colors[x]))
ax.grid(True, which='major', axis='both', alpha=0.4)
ax.set_xlabel('Log2 FC')
ax.set_ylabel('-1*Log10 p-value')
if not figure_title == "":
ax.set_title(variable + " - " + figure_title)
else:
ax.set_title(variable)
ax.scatter(x=[],y=[],marker='.',color='black', label='p-value > {}'.format(significance_threshold)) #set empty scatterplot for legend
ax.scatter(x=[],y=[],marker='.',color='red', label='p-value < {}'.format(significance_threshold)) #set empty scatterplot for legend
ax.legend()
if not trackgene_list == []:
genenames_array = volcano_df['gene_names'].to_numpy()
for trackgene in trackgene_list:
trackgene = trackgene.upper()
if trackgene in genenames_array:
trackgene_index = tnread_gene_a.loc[tnread_gene_a['gene_names'] == trackgene].index[0]
trackgene_annot = ax.annotate(volcano_df.iloc[trackgene_index,:]['gene_names'], (volcano_df.iloc[trackgene_index,:]['fold_change'], volcano_df.iloc[trackgene_index,:]['p_value']),
size=10, c='green', bbox=dict(boxstyle="round", fc="w"))
trackgene_annot.get_bbox_patch().set_alpha(0.6)
else:
print('WARNING: %s not found' % trackgene)
names = volcano_df['gene_names'].to_numpy()
annot = ax.annotate("", xy=(0,0), xytext=(20,20),textcoords="offset points",
bbox=dict(boxstyle="round", fc="w"),
arrowprops=dict(arrowstyle="->"))
annot.set_visible(False)
def update_annot(ind):
pos = sc.get_offsets()[ind["ind"][0]]
annot.xy = pos
# text = "{}, {}".format(" ".join(list(map(str,ind["ind"]))),
# " ".join([names[n] for n in ind["ind"]]))
text = "{}".format(" ".join([names[n] for n in ind["ind"]]))
annot.set_text(text)
# annot.get_bbox_patch().set_facecolor(cmap(norm(c[ind["ind"][0]])))
# annot.get_bbox_patch().set_alpha(0.4)
def hover(event):
vis = annot.get_visible()
if event.inaxes == ax:
cont, ind = sc.contains(event)
if cont:
update_annot(ind)
annot.set_visible(True)
fig.canvas.draw_idle()
else:
if vis:
annot.set_visible(False)
fig.canvas.draw_idle()
fig.canvas.mpl_connect("motion_notify_event", hover)
## return function
return(volcano_df)